# Machine Learning con Scikit-Learn en Python (Clasificacion de Textos)
### Input(x) --> Review
### Output(y) --> Sentiment

# 1.0 Preparando la data

In [30]:
import pandas as pd

In [31]:
# Load the IMDB reviews dataset from a CSV file located in the current working directory.
df_review = pd.read_csv('IMDB Dataset.csv')
# Bare last expression so the notebook renders the frame.
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [32]:
# Summarize column names, non-null counts and dtypes to check for missing values.
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [33]:
# Count how many reviews carry each sentiment label (class balance check).
df_review.value_counts('sentiment')

sentiment
negative    25000
positive    25000
dtype: int64

In [34]:
# Work with a smaller dataset to keep training fast:
# keep only the first 5,000 reviews of each sentiment.
df_positivo = df_review.loc[df_review['sentiment'] == 'positive'].head(5000)
df_negativo = df_review.loc[df_review['sentiment'] == 'negative'].head(5000)

# Stack both subsets into a single frame and display the per-class counts.
df_review_des = pd.concat([df_positivo, df_negativo])
df_review_des.value_counts('sentiment')

sentiment
negative    5000
positive    5000
dtype: int64

# Balanceando el Dataset

In [35]:
from imblearn.under_sampling import RandomUnderSampler

# Seed the sampler: it draws rows at random, so without a fixed random_state
# the resampled frame (and therefore the positional train/test split made from
# it) can differ between runs.
rus = RandomUnderSampler(random_state=42)

# fit_resample returns the resampled features and labels separately.
# Bind the features first, then attach the labels as a 'sentiment' column.
df_review_bal, sentiment_bal = rus.fit_resample(
    df_review_des[['review']],
    df_review_des['sentiment'],
)
df_review_bal['sentiment'] = sentiment_bal

# Display the per-class counts after balancing.
df_review_bal.value_counts('sentiment')

sentiment
negative    5000
positive    5000
dtype: int64

# 1.1 Separando en (train) y (test) 

In [36]:
from sklearn.model_selection import train_test_split

# Split the balanced frame into train and test partitions with a fixed seed.
# NOTE(review): train_size=0.33 keeps only a third of the rows for training and
# leaves the rest for testing (the classification report below shows a support
# of 6,700) — confirm that test_size=0.33 was not the intended argument.
train, test = train_test_split(
    df_review_bal,
    train_size=0.33,
    random_state=42,
)

In [37]:
# Inputs (X) are the raw review texts; targets (Y) are the sentiment labels.
train_X = train['review']
train_Y = train['sentiment']
test_X = test['review']
test_Y = test['sentiment']

 # 2.0 Representacion de Texto (Bag of Words)

Transformar data de texto a data numerica
- CountVectorizer: representa cada review con el conteo de cada palabra (numeros enteros).
- TfidfVectorizer: representa cada review con el peso TF-IDF de cada palabra (numeros decimales).

# 2.1 Transformar data de texto a data numerica

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF vectorizer that drops scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english') 

# fit_transform learns the vocabulary and IDF weights from the training
# reviews and returns their TF-IDF matrix.
train_x_vector = tfidf.fit_transform(train_X)

# The test reviews are only transformed (no fit), reusing the vocabulary and
# weights learned from the training set.
test_x_vector = tfidf.transform(test_X)
 
# train_x_vector
# test_x_vector



# 3.0 Seleccion del modelo.
### Aprendizaje Supervisado
En el Aprendizaje Supervisado se abren dos ramas:
--> Regresion (Output numerico),
--> Clasificacion (Output Discreto)  
- Input(Reviews)
- Output(Sentiment)

### Aprendizaje NO Supervisado
En el Aprendizaje NO Supervisado no conocemos el Output (los datos no estan etiquetados).

En este caso vamos a testear cuatro modelos de Clasificacion.
- SVM (Support Vector Machines)
- Arbol de decision (Decision Tree)
- Naive Bayes
- Regresion Logistica (Logistic Regression)

# 3.1  SVM (Support Vector Machines) 

In [39]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the TF-IDF
# matrix of the training reviews and their sentiment labels.
svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_Y)

### Testeo

In [40]:
# Quick sanity check: vectorize a few hand-written reviews with the fitted
# TF-IDF transformer and print the label the SVM predicts for each one.
for frase in ['A good movie', 'An excelent movie', "I didn't like this movie at all"]:
    print(svc.predict(tfidf.transform([frase])))

['positive']
['positive']
['negative']


# 3.2 Arbol de decision

In [43]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the same TF-IDF features as the other models.
# random_state is fixed because the tree builder permutes features at each
# split; without a seed the fitted tree (and its score) can vary between runs.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(train_x_vector, train_Y)

# 3.3 Naive Bayes

In [49]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier. The TF-IDF matrix is converted with
# .toarray() before fitting, so the model is trained on a dense array.
# NOTE(review): densifying the whole matrix can be memory-heavy on larger
# samples — consider whether a sparse-friendly variant (e.g. MultinomialNB)
# would fit better here.
gnb = GaussianNB()
gnb.fit(train_x_vector.toarray(), train_Y)

# 3.4 Regresion Logistica

In [52]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF matrix of the training reviews.
lr = LogisticRegression()
lr.fit(train_x_vector, train_Y)

# 4.0 Evaluacion de los modelos
Score (Accuracy)

In [56]:
# Print the test-set accuracy of each fitted model, in the order
# SVM, decision tree, Gaussian Naive Bayes, logistic regression.
# GaussianNB was trained on a dense array, so it is scored on one as well.
evaluaciones = [
    (svc, test_x_vector),
    (dtc, test_x_vector),
    (gnb, test_x_vector.toarray()),
    (lr, test_x_vector),
]
for modelo, X_eval in evaluaciones:
    print(modelo.score(X_eval, test_Y))

0.8591044776119403
0.6901492537313433
0.6150746268656716
0.8479104477611941


# 4.1 F1 Score
F1 Score = 2 * (Precisión * Recuperación) / (Precisión + Recuperación)
Si el numero esta cerca de 1 significa que el modelo es bueno y si esta cerca de 0 no es tan bueno


In [59]:
from sklearn.metrics import f1_score

# Per-class F1 for the SVM on the test set. average=None returns one score
# per label, in the order given by `labels` (positive first, then negative).
predicciones_svc = svc.predict(test_x_vector)
f1_score(test_Y, predicciones_svc, labels=['positive', 'negative'], average=None)

array([0.8618267 , 0.85627284])

# 4.2 Reporte de Clasificacion

In [62]:
from sklearn.metrics import classification_report

# Precision, recall, F1 and support per class for the SVM on the test set.
predicciones_svc = svc.predict(test_x_vector)
reporte_svc = classification_report(
    test_Y,
    predicciones_svc,
    labels=['positive', 'negative'],
)
print(reporte_svc)

              precision    recall  f1-score   support

    positive       0.83      0.89      0.86      3294
    negative       0.89      0.83      0.86      3406

    accuracy                           0.86      6700
   macro avg       0.86      0.86      0.86      6700
weighted avg       0.86      0.86      0.86      6700



# 4.3 Confusion Matrix

In [65]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the SVM on the test set. Rows are the true labels and
# columns the predicted labels, both ordered as ['positive', 'negative'].
predicciones_svc = svc.predict(test_x_vector)
matriz_svc = confusion_matrix(
    test_Y,
    predicciones_svc,
    labels=['positive', 'negative'],
)
print(matriz_svc)

[[2944  350]
 [ 594 2812]]


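Las filas son las etiquetas reales y las columnas las predicciones (orden: positive, negative):
- 2944 Verdaderos Positivos (reviews positivas clasificadas como positivas)
- 350 Falsos Negativos (reviews positivas clasificadas como negativas)
- 594 Falsos Positivos (reviews negativas clasificadas como positivas)
- 2812 Verdaderos Negativos (reviews negativas clasificadas como negativas)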

# 5.0 Optimizacion del Modelo

## 5.1 GridSearchCV

In [68]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by the search:
# - C: regularization parameter of the SVM; the regularization strength is
#   inversely proportional to C, so larger values tolerate fewer
#   misclassified training points.
# - kernel: function used to compare samples ('linear' or 'rbf').
parametros = {'C': [1, 4, 8, 16, 32], 'kernel': ['linear', 'rbf']}

# A fresh SVC() is passed directly instead of rebinding `svc`: the name `svc`
# still refers to the fitted model used by the evaluation cells, so those
# cells keep working if they are re-run after this one.
svc_grid = GridSearchCV(SVC(), parametros, cv=5)  # 5-fold cross-validation
svc_grid.fit(train_x_vector, train_Y)


In [69]:
# Show the winning estimator and its hyper-parameters, one per line.
print(svc_grid.best_estimator_, svc_grid.best_params_, sep='\n')

SVC(C=1, kernel='linear')
{'C': 1, 'kernel': 'linear'}


In [76]:
# best_score_ is the mean cross-validated score of the best parameter
# combination, measured on the training folds (not on the test set).
print('El score es:',svc_grid.best_score_)

El score es: 0.8596969696969697
