In [87]:
import pandas as pd

#### Input (x) --> Comentarios(reviews)
#### Output (y) --> Sentimientos

In [88]:
# Path is resolved relative to the notebook's working directory.
IMDB_CSV_PATH = 'IMDB Dataset.csv'
df_review = pd.read_csv(IMDB_CSV_PATH)

In [89]:
# Preview the loaded frame (pandas shows only the first and last rows).
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [90]:
# Build a deliberately imbalanced sample: the first 9000 positive reviews
# followed by the first 1000 negative reviews.
es_positivo = df_review['sentiment'] == 'positive'
es_negativo = df_review['sentiment'] == 'negative'

df_positivo = df_review.loc[es_positivo].iloc[:9000]
df_negativo = df_review.loc[es_negativo].iloc[:1000]

df_review_des = pd.concat([df_positivo, df_negativo], axis=0)

In [91]:
# Preview the imbalanced sample (positives first, then negatives).
df_review_des

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
...,...,...
2000,Stranded in Space (1972) MST3K version - a ver...,negative
2005,"I happened to catch this supposed ""horror"" fli...",negative
2007,waste of 1h45 this nasty little film is one to...,negative
2010,Warning: This could spoil your movie. Watch it...,negative


In [92]:
# Count rows per sentiment label to confirm the class imbalance.
df_review_des.value_counts('sentiment')

sentiment
positive    9000
negative    1000
Name: count, dtype: int64

In [93]:
# Install imbalanced-learn (imported as `imblearn`) with one of the following:
# pip install imbalanced-learn
# conda install -c conda-forge imbalanced-learn
# pip install imblearn

In [94]:
# Dataset desbalanceo
from imblearn.under_sampling import RandomUnderSampler

In [95]:
# Balance the classes by randomly undersampling the majority class.
# random_state is fixed so the sampled rows (and every metric computed
# downstream) are reproducible across kernel restarts.
rus = RandomUnderSampler(random_state=42)
reviews_bal, sentiment_bal = rus.fit_resample(
    df_review_des[['review']],
    df_review_des['sentiment'],
)

# Re-attach the resampled labels to the resampled reviews (aligned on index).
# This replaces the original `a, a['col'] = ...` form, which only worked
# because Python assigns tuple targets left to right.
df_review_bal = reviews_bal.assign(sentiment=sentiment_bal)

In [96]:
# Preview the resampled frame.
df_review_bal

Unnamed: 0,review,sentiment
3,Basically there's a family where a little boy ...,negative
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
10,Phil the Alien is one of those quirky films wh...,negative
11,I saw this movie when I was about 12 when it c...,negative
...,...,...
7504,I saw this flick on the big screen as a kid an...,positive
11345,This movie is the next segment in the pokemon ...,positive
14253,"First of all, the entire script is mostly impr...",positive
5906,"I just watched this movie for the second time,...",positive


In [97]:
# Inspect the label column of the resampled frame.
df_review_bal['sentiment']

3        negative
7        negative
8        negative
10       negative
11       negative
           ...   
7504     positive
11345    positive
14253    positive
5906     positive
4230     positive
Name: sentiment, Length: 2000, dtype: object

## Separando data para entrenar(train) y testear(test) 

In [98]:
from sklearn.model_selection import train_test_split

In [99]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# identical across runs.
traint, test = train_test_split(
    df_review_bal,
    test_size=0.33,
    random_state=42,
)

In [100]:
# Preview the training partition.
traint

Unnamed: 0,review,sentiment
144,"I just got back from this free screening, and ...",negative
1839,Claire Denis's movies seem to fall into one of...,negative
6235,"In Arlington Heights, IL we never had a cafete...",positive
742,Mario Lewis of the Competitive Enterprise Inst...,negative
10157,After watching Oldboy I was a little disappoin...,positive
...,...,...
14791,The key to The 40-Year-Old Virgin is not merel...,positive
9130,Just see it! It's a smart movie but too hard t...,positive
1739,"Obviously, the comments above that fawn over t...",negative
12375,Never Been Kissed gives Drew Barrymore the cha...,positive


In [101]:
# Preview the held-out test partition.
test

Unnamed: 0,review,sentiment
16237,'Steamboat Willie (1928)' is often erroneously...,positive
681,Never saw the original movie in the series...I...,negative
10393,"ATTENTION, SPOILER!<br /><br />Many people tol...",positive
1821,"At the beginning of the film, you might double...",negative
5962,"Okay, when it comes to plots, this film is far...",positive
...,...,...
207,"I have seen most, if not all of the Laurel & H...",negative
4357,Brokedown Palace is the story of two best frie...,positive
11272,when the gilmore girls started in Germany i di...,positive
1051,I rented this movie with my friend for a good ...,negative


In [108]:
# Separate each partition into the text input (x) and the sentiment label (y).
train_x = traint['review']
train_y = traint['sentiment']

test_x = test['review']
test_y = test['sentiment']

# transformar data de texto a data numerica
# representacion de text(Bag of Words)
## 

#### CountVectorizer = frecuencia con la que una palabra aparece dentro de
#### una oración
#### Tfidf = relevancia de una palabra dentro de una oración, ponderada por lo poco común que es en el resto de los textos

###  Count Vectorizer

In [103]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: two short Spanish sentences that share most of their vocabulary.
text = [
    "Amo escribir codigo en Python. Amo el código en Python",
    "Odio escribir codigo en Java. Odio el código en Java",
]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})

# Learn the vocabulary and count each token's occurrences per document.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(df['text'])

# Document-term matrix: one row per review, one column per vocabulary token.
df_dtm = pd.DataFrame(
    data=cv_matrix.toarray(),
    index=df['review'].values,
    columns=cv.get_feature_names_out(),
)
df_dtm

Unnamed: 0,amo,codigo,código,el,en,escribir,java,odio,python
review1,2,1,1,1,2,1,0,0,2
review2,0,1,1,1,2,1,2,2,0


## Tfidf (term frequency - inverse document frequency)

In [104]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus: the same two Spanish sentences used for the CountVectorizer demo.
text = ["Amo escribir codigo en Python. Amo el código en Python",
        "Odio escribir codigo en Java. Odio el código en Java"]

df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})

# No stop-word list is passed: the corpus is Spanish, so scikit-learn's
# built-in 'english' list removes nothing here and only suggests filtering
# that never happens. The resulting matrix is unchanged.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['text'])

# Document-term matrix of TF-IDF weights: one row per review, one column per token.
df_dtm = pd.DataFrame(tfidf_matrix.toarray(), index=df['review'].values, columns=tfidf.get_feature_names_out())
df_dtm

Unnamed: 0,amo,codigo,código,el,en,escribir,java,odio,python
review1,0.576152,0.204969,0.204969,0.204969,0.409937,0.204969,0.0,0.0,0.576152
review2,0.0,0.204969,0.204969,0.204969,0.409937,0.204969,0.576152,0.576152,0.0


## Transformar data de texto a data numerica 

In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vocabulary and IDF weights on the training reviews only, then reuse
# the fitted vectorizer on the test reviews so both share the same columns.
tfidf = TfidfVectorizer(stop_words='english')

train_x_vector = tfidf.fit_transform(raw_documents=train_x)
test_x_vector = tfidf.transform(raw_documents=test_x)

In [110]:
# Show the repr of the vectorized training matrix (shape and stored elements).
train_x_vector

<1340x20397 sparse matrix of type '<class 'numpy.float64'>'
	with 116700 stored elements in Compressed Sparse Row format>

### Aprendizaje Supervisado : se usa input para obtener output.
####  # Regresiones : predice valores continuos (output numerico)
####  # Clasificaciones : predice etiquetas de clase discreta (categorias)
###  Aprendizaje No Supervisado : no hay output etiquetado; se identifican patrones en el input.

## Seleccion del Modelo
## Support Vector Machines (SVM) 

In [119]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the TF-IDF features.
svc = SVC(kernel='linear')
svc.fit(X=train_x_vector, y=train_y)

###  Testeo

In [120]:
# Spot-check the classifier on a few hand-written reviews.
frases_de_prueba = [
    'A good movie',                                               # good movie
    'An excellent movie',                                         # excellent movie
    '"I did not like this movie at all I gave this movie away"',  # did not like it
]
for frase in frases_de_prueba:
    print(svc.predict(tfidf.transform([frase])))

['negative']
['positive']
['negative']


### Decision Tree

In [121]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the TF-IDF features.
# random_state is fixed because the tree's split search is randomized; without
# it the fitted tree, and the score reported below, can change between runs.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(train_x_vector, train_y)

### Naive Bayes

In [122]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes; the sparse TF-IDF matrix is converted to a dense
# array before fitting.
gnb = GaussianNB()
gnb.fit(X=train_x_vector.toarray(), y=train_y)

### Logistic Regression

In [123]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, trained on the TF-IDF features.
lr = LogisticRegression()
lr.fit(X=train_x_vector, y=train_y)

## Evaluacion del Modelo 

### Score (Accuracy): exactitud del modelo

In [124]:
# Test-set accuracy for each classifier, printed in the order:
# SVM, decision tree, Gaussian naive Bayes, logistic regression.
test_x_dense = test_x_vector.toarray()  # gnb is scored on a dense array
for modelo, entradas in (
    (svc, test_x_vector),
    (dec_tree, test_x_vector),
    (gnb, test_x_dense),
    (lr, test_x_vector),
):
    print(modelo.score(entradas, test_y))

0.8454545454545455
0.6651515151515152
0.6257575757575757
0.8287878787878787


### F1 Score 

$$F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$$

In [126]:
from sklearn.metrics import f1_score

# Per-class F1 for the SVM; average=None returns one score per entry in
# `labels`, in that order.
svc_pred = svc.predict(test_x_vector)
f1_score(test_y, svc_pred, labels=['positive', 'negative'], average=None)

array([0.84911243, 0.84161491])

###  Reporte de Clasificacion

In [127]:
from sklearn.metrics import classification_report

# Precision, recall, F1 and support per class for the SVM on the test set.
svc_pred = svc.predict(test_x_vector)
reporte = classification_report(test_y, svc_pred, labels=['positive', 'negative'])
print(reporte)

              precision    recall  f1-score   support

    positive       0.84      0.86      0.85       335
    negative       0.85      0.83      0.84       325

    accuracy                           0.85       660
   macro avg       0.85      0.85      0.85       660
weighted avg       0.85      0.85      0.85       660



### Confusion Matrix 

In [128]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the SVM: rows are true labels, columns are predicted
# labels, both ordered as given in `labels`.
svc_pred = svc.predict(test_x_vector)
confusion_matrix(test_y, svc_pred, labels=['positive', 'negative'])

array([[287,  48],
       [ 54, 271]], dtype=int64)

###  Optimización del Modelo

## GridSearchCV 

In [131]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: regularization strength C crossed with two kernels.
paremetros = {'C':[1,4,8,16,32], 'kernel':['linear', 'rbf']}

# The base estimator is passed inline rather than bound to `svc`.
# GridSearchCV clones its estimator, so rebinding `svc = SVC()` would replace
# the fitted linear model with an unfitted one and break any earlier
# evaluation cell that is re-run.
svc_grid = GridSearchCV(SVC(), paremetros, cv=5)
svc_grid.fit(train_x_vector, train_y)

In [130]:
# Winning estimator and the hyper-parameter combination that produced it.
for resultado in (svc_grid.best_estimator_, svc_grid.best_params_):
    print(resultado)

SVC(C=4)
{'C': 4, 'kernel': 'rbf'}


In [None]:
# Mean cross-validated score of the best parameter combination.
svc_grid.best_score_