# Sentiment Analysis

In [23]:
import pandas as pd
import re
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words

In [6]:
df=pd.read_csv('Sentiment140.csv')

In [5]:
df.shape

(1600000, 6)

In [13]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


### Funciones

In [18]:
def clean_up(s):
    
    new_word=re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',"",s) #Quitas URLs
    new_word=re.sub('[^A-Za-z0-9]+'," ",new_word) #Quitas carácteres especiales
    new_word=re.sub('\d+',"",new_word) #Quitas números
    new_word=new_word.strip()
    
    return new_word

def tokenize(s):
    return word_tokenize(s) 

def stem_and_lemmatize(s):
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    
    stemmed = [stemmer.stem(i) for i in s]
    s_l = [lemmatizer.lemmatize(i) for i in stemmed]
    
    return s_l

def remove_stopwords(s):
    stop_words = get_stop_words('english')
    clean_s=[]
    for e in s:
        if e not in stop_words:
            clean_s.append(e)
    
    return clean_s

In [45]:
df['text_processed']=df.text.apply(lambda x: clean_up(x))

df.text_processed=df.text_processed.apply(lambda x: tokenize(x))

df.text_processed=df.text_processed.apply(lambda x: stem_and_lemmatize(x))

df.text_processed=df.text_processed.apply(lambda x: remove_stopwords(x))

In [46]:
df.text_processed.head()

0    [switchfoot, awww, s, bummer, shoulda, got, da...
1    [upset, can, t, updat, facebook, text, might, ...
2    [kenichan, dive, mani, time, ball, manag, save...
3               [whole, bodi, feel, itchi, like, fire]
4    [nationwideclass, s, behav, m, mad, whi, becau...
Name: text_processed, dtype: object

### Words Freqssss

In [48]:
from nltk.probability import FreqDist

### EJEMPLO
sent = 'This is an example sentence'
fdist = FreqDist()

for word in word_tokenize(sent):
    fdist[word.lower()] += 1
    
fdist

FreqDist({'this': 1, 'is': 1, 'an': 1, 'example': 1, 'sentence': 1})

In [81]:
##### Pasando los valores de toda la columna a una sola lista #######

#str(df.text_processed[0])
#df.text_processed[0]

lst=[]
for e in range(len(df)):
        lst += df.text_processed[e]

In [152]:
lst

['switchfoot',
 'awww',
 's',
 'bummer',
 'shoulda',
 'got',
 'david',
 'carr',
 'third',
 'day',
 'd',
 'upset',
 'can',
 't',
 'updat',
 'facebook',
 'text',
 'might',
 'cri',
 'result',
 'school',
 'today',
 'also',
 'blah',
 'kenichan',
 'dive',
 'mani',
 'time',
 'ball',
 'manag',
 'save',
 'rest',
 'go',
 'bound',
 'whole',
 'bodi',
 'feel',
 'itchi',
 'like',
 'fire',
 'nationwideclass',
 's',
 'behav',
 'm',
 'mad',
 'whi',
 'becaus',
 'can',
 't',
 'see',
 'kwesidei',
 'whole',
 'crew',
 'need',
 'hug',
 'loltrish',
 'hey',
 'long',
 'time',
 'see',
 'yes',
 'rain',
 'bit',
 'onli',
 'bit',
 'lol',
 'm',
 'fine',
 'thank',
 's',
 'tatiana',
 'k',
 'nope',
 'didn',
 't',
 'twittera',
 'que',
 'muera',
 'spring',
 'break',
 'plain',
 'citi',
 's',
 'snow',
 'just',
 're',
 'pierc',
 'ear',
 'caregiv',
 'couldn',
 't',
 'bear',
 'watch',
 'thought',
 'ua',
 'loss',
 'wa',
 'embarrass',
 'octolinz',
 'count',
 'idk',
 'whi',
 'either',
 'never',
 'talk',
 'anymor',
 'smarrison',
 

In [158]:
for word in lst:
    fdist[word] += 1

In [159]:
fdist

FreqDist({'s': 573687, 't': 551349, 'm': 421692, 'go': 416088, 'just': 381426, 'get': 332535, 'day': 327798, 'wa': 315099, 'can': 290043, 'now': 282186, ...})

In [160]:
from pandas.io.json import json_normalize
bag_of_words=json_normalize(fdist)

In [161]:
bag_of_words

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaa,aaaaaaaa,aaaaaaaaa,aaaaaaaaaa,aaaaaaaaaaa,...,zzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
0,642,360,153,90,66,24,36,21,24,12,...,6,3,3,9,3,3,9,3,3,3


In [162]:
bag_of_words=bag_of_words.sort_values(by=0,axis=1, ascending=False).iloc[:,0:5000]
bag_of_words=bag_of_words.T

In [163]:
lst_top=list(bag_of_words.index)

In [164]:
def find_features(document):
    words = set(document)
    features = {}
    
    for w in lst_top:
        features[w] = (w in words)
    
    return features

In [169]:
find_features(lst) #Obviamente mis top 5000 estan dentro de mi lst del total de palabras

{'s': True,
 't': True,
 'm': True,
 'go': True,
 'just': True,
 'get': True,
 'day': True,
 'wa': True,
 'can': True,
 'now': True,
 'good': True,
 'work': True,
 'like': True,
 'love': True,
 'quot': True,
 'got': True,
 'u': True,
 'today': True,
 'time': True,
 'lol': True,
 'thank': True,
 'miss': True,
 'one': True,
 'back': True,
 'want': True,
 'know': True,
 'will': True,
 'see': True,
 'feel': True,
 'think': True,
 'im': True,
 'don': True,
 'realli': True,
 'amp': True,
 'night': True,
 'hope': True,
 'watch': True,
 'still': True,
 'need': True,
 'make': True,
 'well': True,
 'new': True,
 'na': True,
 'home': True,
 'll': True,
 'oh': True,
 'look': True,
 'ha': True,
 'come': True,
 'much': True,
 'last': True,
 'twitter': True,
 'morn': True,
 're': True,
 'tomorrow': True,
 'wish': True,
 'great': True,
 'wait': True,
 'sad': True,
 'sleep': True,
 'haha': True,
 'bad': True,
 'fun': True,
 'week': True,
 'whi': True,
 'tri': True,
 'right': True,
 'onli': True,
 'foll

In [170]:
df.text_processed # a lo que entiendo, lo que piden es ver las palabras de cada tweet y compararlas con tus top 5000.

#Tendría que modificar la función creada anteriormente.

0          [switchfoot, awww, s, bummer, shoulda, got, da...
1          [upset, can, t, updat, facebook, text, might, ...
2          [kenichan, dive, mani, time, ball, manag, save...
3                     [whole, bodi, feel, itchi, like, fire]
4          [nationwideclass, s, behav, m, mad, whi, becau...
5                                    [kwesidei, whole, crew]
6                                                [need, hug]
7          [loltrish, hey, long, time, see, yes, rain, bi...
8                                [tatiana, k, nope, didn, t]
9                                     [twittera, que, muera]
10                     [spring, break, plain, citi, s, snow]
11                                    [just, re, pierc, ear]
12         [caregiv, couldn, t, bear, watch, thought, ua,...
13         [octolinz, count, idk, whi, either, never, tal...
14         [smarrison, ve, first, didn, t, gun, realli, t...
15         [iamjazzyfizzl, wish, got, watch, miss, iamlil...
16         [holli, death

In [None]:
def find_features(document):
    words = set(document)
    features = {}
    
    for w in lst_top:
        features[w] = (w in words)
    
    return features