In [36]:
import pandas as pd

#### Input (x) --> Comentarios (reviews)
#### Output (y) --> Sentimientos

In [37]:
# Load the IMDB reviews CSV; the path is relative to the notebook's working directory.
df_review = pd.read_csv('IMDB Dataset.csv')

In [38]:
# Display the loaded reviews DataFrame (columns: review, sentiment).
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [39]:
# Build a deliberately imbalanced sample: the first 9000 positive reviews
# followed by the first 1000 negative reviews (original row order and index kept).
df_positivo = df_review.loc[df_review['sentiment'].eq('positive')].head(9000)
df_negativo = df_review.loc[df_review['sentiment'].eq('negative')].head(1000)
df_review_des = pd.concat([df_positivo, df_negativo])

In [40]:
# Display the imbalanced subset (positive rows first, then negative rows).
df_review_des

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
...,...,...
2000,Stranded in Space (1972) MST3K version - a ver...,negative
2005,"I happened to catch this supposed ""horror"" fli...",negative
2007,waste of 1h45 this nasty little film is one to...,negative
2010,Warning: This could spoil your movie. Watch it...,negative


In [41]:
# Count rows per sentiment label to confirm the 9000 / 1000 imbalance.
df_review_des.value_counts('sentiment')

sentiment
positive    9000
negative    1000
Name: count, dtype: int64

In [42]:
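# Install imbalanced-learn once, using ONE of the commands below
# (run it in a terminal, or prefix with % to run it from a notebook cell):
# pip install imbalanced-learn
# conda install -c conda-forge imbalanced-learn
# pip install imblearn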

In [43]:
# Dataset desbalanceo
from imblearn.under_sampling import RandomUnderSampler

In [44]:
# Balance the classes by randomly discarding rows of the majority class.
# A fixed seed makes the sampled rows (and every downstream split and metric)
# reproducible across kernel restarts.
rus = RandomUnderSampler(random_state=42)

# fit_resample returns (X_resampled, y_resampled). X only carries the 'review'
# column, so the labels are attached back as 'sentiment' in an explicit second
# step instead of relying on left-to-right binding of a tuple assignment.
df_review_bal, sentiment_bal = rus.fit_resample(df_review_des[['review']],
                                                df_review_des['sentiment'])
df_review_bal['sentiment'] = sentiment_bal

In [45]:
# Display the class-balanced DataFrame produced by the under-sampler.
df_review_bal

Unnamed: 0,review,sentiment
3,Basically there's a family where a little boy ...,negative
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
10,Phil the Alien is one of those quirky films wh...,negative
11,I saw this movie when I was about 12 when it c...,negative
...,...,...
14007,"I have seen ""Miracles Still Happen"" now at lea...",positive
44,"This movie struck home for me. Being 29, I rem...",positive
16516,"A great concept, a great cast, and what a pity...",positive
14994,*** out of ****<br /><br />Yep! Dressed To Kil...,positive


In [46]:
# Display only the label column of the balanced DataFrame.
df_review_bal['sentiment']

3        negative
7        negative
8        negative
10       negative
11       negative
           ...   
14007    positive
44       positive
16516    positive
14994    positive
12129    positive
Name: sentiment, Length: 2000, dtype: object

## Separando los datos para entrenar (train) y testear (test)

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
# Hold out 33% of the balanced reviews for testing; the fixed seed keeps the
# partition identical between runs.
traint, test = train_test_split(
    df_review_bal,
    random_state=42,
    test_size=0.33,
)

In [49]:
# Display the training partition.
traint

Unnamed: 0,review,sentiment
144,"I just got back from this free screening, and ...",negative
1839,Claire Denis's movies seem to fall into one of...,negative
2560,A wonderfully quirky film with enough twists f...,positive
742,Mario Lewis of the Competitive Enterprise Inst...,negative
2763,I watched it some years ago. I remembered it a...,positive
...,...,...
13357,This is very much not the sort of movie for wh...,positive
3466,"This is not a film is a clever, witty and ofte...",positive
1739,"Obviously, the comments above that fawn over t...",negative
2093,This is a fine musical with a timeless score b...,positive


In [50]:
# Display the held-out test partition.
test

Unnamed: 0,review,sentiment
17598,"Sadly, this movie is relegated to 'curio' stat...",positive
681,Never saw the original movie in the series...I...,negative
10695,"In the film, Lumumba, we see the faces behind ...",positive
1821,"At the beginning of the film, you might double...",negative
8995,"This is a pretty obscure, dumb horror movie se...",positive
...,...,...
207,"I have seen most, if not all of the Laurel & H...",negative
5506,"This film is like ""The Breakfast Club"" meets ""...",positive
5382,Wow! A Danish movie with this kind of content?...,positive
1051,I rented this movie with my friend for a good ...,negative


In [51]:
# Split each partition into model input (review text) and target (sentiment label).
train_x, train_y = traint['review'], traint['sentiment']
test_x, test_y = test['review'], test['sentiment']

# The training input was originally misspelled as `tain_x`; keep that name as an
# alias so any later cell that still refers to it keeps working.
tain_x = train_x

# Transformar datos de texto a datos numéricos
# Representación de texto (Bag of Words)
## 

#### CountVectorizer = frecuencia con la que una palabra aparece dentro de una oración
#### Tfidf = relevancia de una palabra dentro de una oración

###  Count Vectorizer

In [66]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: two short Spanish sentences used to illustrate the bag-of-words idea.
text = [
    "Amo escribir codigo en Python. Amo el código en Python",
    "Odio escribir codigo en Java. Odio el código en Java",
]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})

# Learn the vocabulary and count how many times each token occurs per document.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(df['text'])

# Turn the count matrix into a labelled document-term table: one row per review,
# one column per vocabulary token. Note that 'codigo' and 'código' end up as two
# separate columns because the spelling differs by an accent.
df_dtm = pd.DataFrame(
    data=cv_matrix.toarray(),
    columns=cv.get_feature_names_out(),
    index=df['review'].values,
)
df_dtm

Unnamed: 0,amo,codigo,código,el,en,escribir,java,odio,python
review1,2,1,1,1,2,1,0,0,2
review2,0,1,1,1,2,1,2,2,0


## Tfidf (term frequency - inverse document frequency)

In [65]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus: two short Spanish sentences used to illustrate TF-IDF weighting.
text = [
    "Amo escribir codigo en Python. Amo el código en Python",
    "Odio escribir codigo en Java. Odio el código en Java",
]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})

# No `stop_words='english'` here: the corpus is Spanish, so the English list
# removes none of the Spanish function words ('el', 'en' stay in the vocabulary)
# and on other Spanish text it would wrongly drop words that happen to match
# English stop words (e.g. the negation "no"). To filter Spanish stop words,
# pass an explicit list instead, e.g. TfidfVectorizer(stop_words=['el', 'en']).
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['text'])

# One row per review, one column per vocabulary token, values are TF-IDF weights.
df_dtm = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df['review'].values,
    columns=tfidf.get_feature_names_out(),
)
df_dtm

Unnamed: 0,amo,codigo,código,el,en,escribir,java,odio,python
review1,0.576152,0.204969,0.204969,0.204969,0.409937,0.204969,0.0,0.0,0.576152
review2,0.0,0.204969,0.204969,0.204969,0.409937,0.204969,0.576152,0.576152,0.0
