In [None]:
# This should be the main file.


## Functions

### Function to pre process FILES

In [1]:
import pandas as pd
import nltk

random_state = 42

def load_and_preprocess_IMDB(filename, nrows=None):
    """ load the IMDB data and preprocess it:
            - remove html tags
            - remove ponctuation
            - convert to lower case
            - remove stop words
            - remove numbers
            - remove extra spaces
            - replave words with their root form (stem)
            - replace words with their lemma
        :param dataset: 'train' or 'test'
        :param nrows: number of rows to read
        :return: df
    """

    # read the data
    df = pd.read_csv(filename, nrows=nrows)

    # keep a copy of the original review
    df['original_review'] = df['review']

    # remove the html tags
    df['review'] = df['review'].str.replace('<br />', ' ')

    # remove the punctuation and '_' characters
    df['review'] = df['review'].str.replace('[^\w\s]', ' ', regex=True)
    df['review'] = df['review'].str.replace('_', ' ', regex=False)

    # convert to lower case
    df['review'] = df['review'].str.lower()

    # remove the stop words
    # nltk.download('stopwords')
    # nltk.download('wordnet')
    from nltk.corpus import stopwords
    
    stop_words = stopwords.words('english')
    df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

    # remove the numbers - test on https://regexr.com
    df['review'] = df['review'].str.replace('\d+', '', regex=True)

    # remove the extra spaces - test on https://regexr.com
    df['review'] = df['review'].str.replace(' +', ' ', regex=True)

    # replace the words with their root form
    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer('english')
    df['review'] = df['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

    # replace the words with their lemma
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    df['review'] = df['review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

    return df

### Function to pre-process TEXT

In [2]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
      # remove html tags
    text = text.replace('<br />', ' ')

    # remove punctuation and '_'
    for char in ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']:
        text = text.replace(char, ' ')

    # convert to lower case
    text = text.lower()

    # remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # remove numbers
    text = ''.join([char for char in text if not char.isdigit()])

    # remove extra spaces
    text = ' '.join(text.split())

    # replace words with their root form
    stemmer = SnowballStemmer('english')
    text = ' '.join([stemmer.stem(word) for word in text.split()])

    # replace words with their lemma
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    return text

## START

In [3]:
df = load_and_preprocess_IMDB('./data/imdb_data_train.zip')
df.head()

Unnamed: 0,filename,review,classification,sentiment,original_review
0,1821_4.txt,work one best shakespear sourc film manag cred...,4,0,Working with one of the best Shakespeare sourc...
1,10402_1.txt,well tremor origin start found movi quit enjoy...,1,0,"Well...tremors I, the original started off in ..."
2,1062_4.txt,ouch one bit pain sit cute amus premis goe hel...,4,0,Ouch! This one was a bit painful to sit throug...
3,9056_1.txt,seen crappi movi life one must among worst def...,1,0,"I've seen some crappy movies in my life, but t..."
4,5392_3.txt,carrier follow exploit two guy two gal stolen ...,3,0,"""Carriers"" follows the exploits of two guys an..."


### Show the data grouped by sentiment

In [4]:
df.groupby(by='sentiment').count()

Unnamed: 0_level_0,filename,review,classification,original_review
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,12500,12500,12500,12500
1,12500,12500,12500,12500


### Generate de BOW Matrix

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(lowercase=True, # it should already be in lower case...
                                   stop_words='english', # stop words should already have been removed but ...
                                   ngram_range = (1, 1))

cv.fit(df['review'])
count_vectors_train = cv.transform(df['review'])
count_vectors_train

<25000x48815 sparse matrix of type '<class 'numpy.int64'>'
	with 2083655 stored elements in Compressed Sparse Row format>

### Build a dataframe with BoW and add the sentiment column (for an easier visualization)

In [6]:
bow_train = pd.DataFrame(count_vectors_train.toarray(), columns=cv.get_feature_names_out())
bow_train

Unnamed: 0,aa,aaa,aaaaaaah,aaaaah,aaaaatch,aaaahhhhhhh,aaaand,aaaarrgh,aaah,aaargh,...,était,état,étc,éveri,êxtase,ís,ísnt,østbye,über,üvegtigri
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Try a simple Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

# this can take a while... +1h on M1
tree = DecisionTreeClassifier(
    max_depth=20, # with 3 it takes 2minutes
)
tree.fit(bow_train, df['sentiment'])

### Load the test data and pass it through the BOW

In [7]:
df_test = load_and_preprocess_IMDB('./data/imdb_data_test.zip')
count_vectors_test = cv.transform(df_test['review'])

### Build the BOW Matrix

In [8]:
bow_test = pd.DataFrame(count_vectors_test.toarray(), columns=cv.get_feature_names_out())
bow_test


Unnamed: 0,aa,aaa,aaaaaaah,aaaaah,aaaaatch,aaaahhhhhhh,aaaand,aaaarrgh,aaah,aaargh,...,était,état,étc,éveri,êxtase,ís,ísnt,østbye,über,üvegtigri
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Predict the sentiment

In [11]:
tree.score(bow_test,df_test['sentiment'])

0.7432

### Save the model and the vectorizer

In [21]:
import pickle

# save the model to disk
filename = './models/tree_model.sav'
pickle.dump(tree, open(filename,'wb'))

# save the vectorizer to disk
filename = './vectorizers/count_vectorizer.sav'
pickle.dump(cv, open(filename,'wb'))

            


## Training

### Logistic Regression

In [None]:


# Model building using Bag of words model- Logistic Regression
from sklearn.linear_model import LogisticRegression


logR = LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

#Fitting the model for the bag of words
logR_Bow=logR.fit(bow_train, df['sentiment'])

### Now do the same but use and ensemble, with tree , NN and Logistic Regression

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score


# Create individual classifiers
tree = DecisionTreeClassifier(max_depth=20, random_state=random_state)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, random_state=random_state)
logR = LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=random_state)

# Create a voting ensemble
ensemble = VotingClassifier(estimators=[('tree', tree), ('mlp', mlp), ('lgr',logR)], voting='hard',n_jobs=-1)

# Train the ensemble
ensemble.fit(bow_train,df['sentiment'])


### Predictions

In [12]:
# Make predictions with the ensemble
ensemble_predictions = ensemble.predict(bow_test)
ensemble_predictions



array([0, 1, 0, ..., 1, 1, 1], dtype=int64)

### Show the Score

In [18]:
score=accuracy_score(df['sentiment'],ensemble_predictions) # best Teachers score = 0.79
score

0.79728

### Save the Model to disk

In [13]:
import pickle

# save the model to disk
filename = './models/ensemble20122023.sav'
pickle.dump(tree, open(filename,'wb'))

## Testing with text!

### Trying just one text

In [18]:

test = preprocess_text("Hello I hate this is horrible!")
# Convert the preprocessed text to a bag-of-words representation using the same vectorizer used during training
cvText = cv.transform([test])
bow_testText = pd.DataFrame(cvText.toarray(), columns=cv.get_feature_names_out())
predictTest = ensemble.predict(bow_testText)

if (predictTest[0]== 1 ) : 
    print("Positivo")
else :
    print ("Negativo")

Negativo


### Trying another text

In [16]:
test = preprocess_text("I love this weather is great!")
# Convert the preprocessed text to a bag-of-words representation using the same vectorizer used during training
cvText = cv.transform([test])
bow_testText = pd.DataFrame(cvText.toarray(), columns=cv.get_feature_names_out())
predictTest = ensemble.predict(bow_testText)

if (predictTest[0]== 1 ) : 
    print("Positivo")
else :
    print ("Negativo")

Positivo


### Evaluate the Emsemble Performance Score (DOESNT WORK)

In [35]:
# Evaluate the ensemble performance
ensemble_accuracy = accuracy_score(bow_test, ensemble_predictions)

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and binary targets

: 