In [None]:
# This should be the main file.


## Preprocessing function

In [None]:
import pandas as pd

def load_and_preprocess_IMDB(filename, nrows=None):
    """Load an IMDB review CSV and normalise the text of its ``review`` column.

    The untouched text is first copied to a new ``original_review`` column.
    The ``review`` column then goes through these steps, in this order:

        1. replace the literal ``<br />`` tag with a space
        2. replace punctuation and ``_`` characters with a space
        3. convert to lower case
        4. remove English stop words (NLTK list)
        5. remove digits
        6. collapse runs of spaces
        7. replace words with their root form (Snowball stemmer)
        8. replace words with their lemma (WordNet lemmatizer)

    NLTK must be installed, together with the data used by ``stopwords``
    and ``WordNetLemmatizer``.

    :param filename: CSV source passed straight to ``pandas.read_csv``;
        it must contain a ``review`` column
    :param nrows: number of rows to read (``None`` reads everything)
    :return: df with the cleaned ``review`` column and the added
        ``original_review`` column
    """
    # NLTK is only needed by this function, so it is imported locally.
    from nltk.corpus import stopwords
    from nltk.stem import SnowballStemmer, WordNetLemmatizer

    # read the data
    df = pd.read_csv(filename, nrows=nrows)

    # keep a copy of the original review
    df['original_review'] = df['review']

    # remove the html line breaks; regex=False makes this a literal match
    # whatever the pandas default for `regex` is
    df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

    # remove the punctuation and '_' characters
    # (raw string: '\w' and '\s' are regex classes, not Python escapes)
    df['review'] = df['review'].str.replace(r'[^\w\s]', ' ', regex=True)
    df['review'] = df['review'].str.replace('_', ' ', regex=False)

    # convert to lower case
    df['review'] = df['review'].str.lower()

    # remove the stop words; a frozenset gives constant-time membership tests
    # instead of scanning the whole list for every token
    stop_words = frozenset(stopwords.words('english'))
    df['review'] = df['review'].apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_words)
    )

    # remove the numbers - test on https://regexr.com
    df['review'] = df['review'].str.replace(r'\d+', '', regex=True)

    # remove the extra spaces - test on https://regexr.com
    df['review'] = df['review'].str.replace(' +', ' ', regex=True)

    # replace the words with their root form
    # (split/join also drops any leading or trailing space left above)
    stemmer = SnowballStemmer('english')
    df['review'] = df['review'].apply(
        lambda text: ' '.join(stemmer.stem(word) for word in text.split())
    )

    # replace the words with their lemma
    lemmatizer = WordNetLemmatizer()
    df['review'] = df['review'].apply(
        lambda text: ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    )

    return df

### START

In [None]:
# Read the training archive and run the cleaning pipeline on it.
df = load_and_preprocess_IMDB(filename='./data/imdb_data_train.zip')
# Preview the first rows of the cleaned frame.
df.head(n=5)

### Show the data grouped by sentiment

In [None]:
# Non-null counts of every column, one row per sentiment label.
df.groupby('sentiment').count()

### Generate the BoW matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer for the cleaned training reviews.
cv = CountVectorizer(
    lowercase=True,        # it should already be in lower case...
    stop_words='english',  # stop words should already have been removed but ...
    ngram_range=(1, 1),    # single words only
)

# fit_transform learns the vocabulary and builds the count matrix in a single
# pass, instead of tokenizing the whole corpus twice (fit, then transform).
count_vectors_train = cv.fit_transform(df['review'])
count_vectors_train

### Build a dataframe with BoW and add the sentiment column (for an easier visualization)

In [None]:
# Dense DataFrame view of the count matrix, with the vectorizer's feature
# names as column labels.
# NOTE: toarray() materialises every cell, so memory grows with
# (number of rows) x (vocabulary size).
bow_train = pd.DataFrame(
    data=count_vectors_train.toarray(),
    columns=cv.get_feature_names_out(),
)
bow_train

### Try a simple Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# this can take a while... +1h on M1
tree = DecisionTreeClassifier(
    max_depth=3,     # previously 20
    random_state=0,  # fixed seed: ties between equally good splits are
                     # otherwise broken randomly, so reruns could differ
)
tree.fit(bow_train, df['sentiment'])

### Load the test data and pass it through the BOW

In [None]:
# Clean the test reviews with the same pipeline used for the training set.
df_test = load_and_preprocess_IMDB(filename='./data/imdb_data_test.zip')
# Encode them with the vocabulary already held by `cv`
# (transform only, no re-fit).
count_vectors_test = cv.transform(raw_documents=df_test['review'])