## Cargamos las librerías necesarias

In [1]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import pandas as pd



## Cargamos el set de datos con Pandas

In [3]:
# Load the blog corpus into a DataFrame; the path is relative, so it is
# resolved against the directory the notebook kernel is running in.
df = pd.read_csv('data/blogtext.csv')

### y visualizamos los primeros 5 registros

In [4]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


## Definimos una submuestra para la prueba de concepto

In [6]:
# Proof-of-concept subsample: the first five rows of the full dataset.
# An explicit positional slice plus .copy() makes the sample independent of
# `df`, so later edits to either frame cannot leak into the other.
df2 = df.iloc[:5].copy()

### Lo primero es definir el SIA (Sentiment Intensity Analyzer)
### y el depósito de los resultados
### iteramos sobre cada texto y obtenemos sus scores de polaridad, los cuales guardamos junto con el texto

In [7]:
# Instantiate the VADER analyzer once and reuse it for every text.
sia = SIA()

# One record per text: the polarity scores returned by the analyzer,
# followed by the text itself under the 'text' key.
results = [
    {**sia.polarity_scores(text), 'text': text}
    for text in df2['text']
]

## Ahora podemos observar los resultados del SIA junto con el texto

In [8]:
# Turn the collected score records into a table, one row per analysed text,
# and display it.
results = pd.DataFrame(data=results)
results

Unnamed: 0,compound,neg,neu,pos,text
0,0.0,0.0,1.0,0.0,"Info has been found (+/- 100 pages,..."
1,0.0,0.0,1.0,0.0,These are the team members: Drewe...
2,-0.8167,0.09,0.814,0.097,In het kader van kernfusie op aarde...
3,0.0,0.0,1.0,0.0,testing!!! testing!!!
4,0.8805,0.0,0.841,0.159,Thanks to Yahoo!'s Toolbar I can ...


In [9]:
# Bucket the compound score into a sentiment label:
#    1 (positive) when compound >  0.2
#   -1 (negative) when compound < -0.2
#    0 (neutral)  otherwise
compound = results['compound']
results['label'] = (compound > 0.2).astype('int64') - (compound < -0.2).astype('int64')
results

Unnamed: 0,compound,neg,neu,pos,text,label
0,0.0,0.0,1.0,0.0,"Info has been found (+/- 100 pages,...",0
1,0.0,0.0,1.0,0.0,These are the team members: Drewe...,0
2,-0.8167,0.09,0.814,0.097,In het kader van kernfusie op aarde...,-1
3,0.0,0.0,1.0,0.0,testing!!! testing!!!,0
4,0.8805,0.0,0.841,0.159,Thanks to Yahoo!'s Toolbar I can ...,1


In [10]:
# Attach the sentiment columns to the full dataset, aligning rows on the index.
# Rows that were not scored (everything outside the subsample) get NaN, which
# is why `label` appears as float in the joined frame.
#
# Written so the cell can be re-run safely: only the needed columns are joined,
# and any sentiment columns left over from a previous run are dropped first
# instead of colliding with the new ones.
sentiment_cols = ['compound', 'label']
df = (
    df.rename(columns={'text': 'text_orig'})  # resulting column name is 'text_orig'
      .drop(labels=sentiment_cols, axis=1, errors='ignore')  # clear scores from an earlier run
      .join(results[sentiment_cols])
)

# Show a bounded preview rather than dumping the whole frame.
df.head(10)

Unnamed: 0,id,gender,age,topic,sign,date,text_orig,compound,label
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",0.0000,0.0
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,0.0000,0.0
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,-0.8167,-1.0
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,0.0000,0.0
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,0.8805,1.0
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...,,
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...,,
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o...",,
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...,,
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...,,
