# Sentiment Analysis - CountVectorizer and TF-IDF

This notebook contains the training, evaluation and predictions of a classifier for sentiment analysis using CountVectorizer and TF-IDF.

### Imports

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score

### Read and plot target

In [None]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')
# Preview the first rows as the cell output to sanity-check the columns.
df.head(n=5)

In [None]:
# Count how many rows carry each distinct value of the target column.
df['Sentiment'].value_counts()

### Training and experimentation

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, x_test, Y_train, y_test = train_test_split(
    df['Sentence'],
    df['Sentiment'],
    test_size=0.2,
    random_state=13,
)
# Report the size of each split as a quick sanity check.
print(f'Train shapes {X_train.shape}, {Y_train.shape}')
print(f'Test shapes {x_test.shape}, {y_test.shape}')

In [None]:
# Candidate 1: token counts (English stop words removed) -> multinomial Naive Bayes.
pipeline_1 = make_pipeline(
    CountVectorizer(stop_words='english'),
    MultinomialNB(),
)

# Candidate 2: the same token counts -> logistic regression.
pipeline_2 = make_pipeline(
    CountVectorizer(stop_words='english'),
    LogisticRegression(),
)

# Candidate 3: token counts re-weighted by TF-IDF -> multinomial Naive Bayes.
pipeline_3 = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    MultinomialNB(),
)

In [None]:
# Train all three candidate pipelines on the same training split so that
# their scores can be compared directly.
pipeline_1.fit(X_train, Y_train)
pipeline_2.fit(X_train, Y_train)
# NOTE(review): as the cell's final expression, this call's return value is
# what the notebook renders (presumably the fitted pipeline) — keep it last.
pipeline_3.fit(X_train, Y_train)

### Metrics

In [None]:

# Define the pipelines to be evaluated
pipelines = [pipeline_1, pipeline_2, pipeline_3]

# Evaluate each pipeline: predict on both the training and the test split and
# print the two accuracies side by side, so a large train/test gap is visible.
# The loop variable names are kept as-is because they remain in the notebook
# namespace after the cell runs.
for i, model in enumerate(pipelines):
    y_pred_train = model.predict(X_train)
    Y_pred = model.predict(x_test)
    print(f'Pipeline {i+1}: Accuracy train {accuracy_score(Y_train, y_pred_train)}, test {accuracy_score(y_test, Y_pred)} ')
    # Emit a blank separator line between pipelines; the newline escape is
    # '\n', so printing '/n' would show those two characters literally.
    print()

