In [None]:
%load_ext autoreload
%autoreload 2

# Sentiment Analysis On Movie Reviews

In this problem, we are given a dataset of movie reviews and the corresponding sentiment (positive or negative). Our goal is to build a model that can classify a movie review as positive or negative based on its text.

Import the necessary modules

In [None]:
import pandas
import numpy as np
import importlib

from classifiers.RandomClassifier import RandomClassifier 
from classifiers.DecisionTreeClassifier import DecisionTreeClassifier
from StatisticManager import StatisticManager

Read and show the head of the training data set

In [None]:
# Load the descriptive features and the target labels of the training set.
X_train_raw=pandas.read_excel('../data/Dataset2_train/X_train.xlsx')
y_train_raw=pandas.read_excel('../data/Dataset2_train/y_train.xlsx')
X_train_raw.info()
# Only the last expression of a cell is rendered automatically, so the
# preview of the features has to be displayed explicitly (`display` is
# provided by the IPython kernel).
display(X_train_raw.head())
y_train_raw.head()


- Set up the target feature column name

In [None]:
# Name of the label column in the raw target frame
# (the commented line keeps the alternative column name for reference).
target = "Sentiment"
# target = "class"

# Name of the derived column: the raw label name with a "_cat" suffix.
target_feature = f"{target}_cat"


## Data Process

- Fill all missing values of the descriptive features with an empty string
- Change the target feature to a categorical feature
- Show the percentage of each class

In [None]:
# Replace missing values in the descriptive features with an empty string.
X_train_raw=X_train_raw.fillna('')

# Convert the raw label column into a categorical column and drop the raw one.
# The guard keeps the cell re-runnable: on a second run the raw column has
# already been dropped, so there is nothing left to convert.
if target in y_train_raw.columns:
    y_train_raw[target_feature]=y_train_raw[target].astype("category")
    y_train_raw=y_train_raw.drop(columns=[target])

y_train_raw.info()
# Share of each class, in percent.
y_train_raw[target_feature].value_counts(normalize=True)*100



- Profile the descriptive features of the training dataset

In [None]:

# from pandas_profiling import ProfileReport

# ProfileReport(X_train,title="X_train profiling")
# ProfileReport(y_train,title="y_train profiling")

## Process input data

In [None]:
# Store the length of every phrase (as reported by len) in a new 'len' column,
# then show the resulting frame.
phrase_lengths = X_train_raw['Phrase'].map(len)
X_train_raw['len'] = phrase_lengths
X_train_raw

## Archive
This section is not used for the final result. Please skip it.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')


In [None]:
# q=X_train_raw.Phrase[4]
# word_tokenize(q)

In [None]:
# stemmer=SnowballStemmer(language='english')


In [None]:
# def tokenize(text):
#     return [token.lower() for token in word_tokenize(text) if token.isalpha()]

In [None]:
# tokenize(q)

In [None]:
# nltk.download('stopwords')
# stopWords=stopwords.words('english')
# ", ".join(stopWords)

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# vectorizer=TfidfVectorizer(tokenizer=tokenize,lowercase=True,stop_words=stopWords,ngram_range=(1,2),max_features=5000)

In [None]:
# inputs=vectorizer.fit_transform(X_train_raw.Phrase)
# inputs.shape

In [None]:
# print(inputs)
# vectorizer.get_feature_names_out()[:100]

In [None]:
# vectorizer.fit(X_train_raw.Phrase)

- Convert all tokens to lowercase

In [None]:
# vectorizer.vocabulary_
# vectorizer.get_feature_names_out()[:100]

In [None]:
# inputs=vectorizer.transform(X_train_raw.Phrase)

In [None]:
# inputs.shape

## Decision Tree 
- Reference to the demo code

In [None]:
import re

import nltk
import tqdm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Everything the preprocessing helpers below need is imported here, so this
# cell does not depend on the optional "Archive" section having been run.

# Download the corpora used by the stop-word filter, the lemmatizer and the
# tokenizer.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Shared lemmatizer instance used by preprocess_data.
lemmatizer = WordNetLemmatizer()

def preprocess_data(df):
    """Turn the raw phrases of ``df`` into lists of lemmatised tokens.

    For every entry of ``df['Phrase']`` the markup is stripped with
    BeautifulSoup, every non-letter character is replaced by a space, the
    text is lower-cased and tokenised, English stop words are removed and the
    remaining tokens are lemmatised.

    Args:
        df: DataFrame with a ``'Phrase'`` column holding the raw texts.

    Returns:
        A list with one list of tokens per row of ``df``, in row order.
    """
    # Build the stop-word set once: it does not depend on the phrase, so there
    # is no reason to reconstruct it on every iteration.
    stops = set(stopwords.words('english'))

    reviews = []
    for raw in tqdm.tqdm(df['Phrase']):
        text = BeautifulSoup(raw,'lxml').get_text()
        # Keep letters only; everything else becomes a token separator.
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        lemma_words = [lemmatizer.lemmatize(word) for word in words if word not in stops]
        reviews.append(lemma_words)
    return reviews

def tokenizer_preprocess(list_X_train, list_X_val):
    """Encode tokenised texts as padded integer sequences.

    The tokenizer is fitted on the training texts only. Both the training and
    the validation texts are then converted with it and padded to the length
    of the longest training text.

    NOTE(review): ``Tokenizer`` and ``sequence`` are not imported anywhere in
    the visible notebook (they look like the Keras text-preprocessing
    helpers — confirm), so this function raises ``NameError`` when called
    until those imports are added.

    Args:
        list_X_train: training texts, each a list of tokens.
        list_X_val: validation texts, each a list of tokens.

    Returns:
        Tuple ``(X_train, X_val)`` of the padded sequences.
    """
    # Collect the training vocabulary and the longest training text.
    unique_words = set()
    len_max = 0
    for sent in tqdm.tqdm(list_X_train):
        unique_words.update(sent)
        len_max = max(len_max, len(sent))

    # The vocabulary size caps the number of words the tokenizer keeps.
    tokenizer = Tokenizer(num_words=len(unique_words))
    tokenizer.fit_on_texts(list(list_X_train))

    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = sequence.pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = sequence.pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

### Sample data

Since the original dataset is too large, training the model on all of it takes too long. We can therefore randomly sample some of the data to shorten the training process. However, this may also decrease the performance of the model.

To save time, it is recommended to use 10% of the data or less. You can change the sampling rate via the argument of the `randint` call.

In [None]:
assert len(X_train_raw)==len(y_train_raw), "Length error"
size=len(X_train_raw)

# Roughly one row out of SAMPLE_ONE_IN is kept (10 -> about 10% of the data).
SAMPLE_ONE_IN=10
# A dedicated, seeded generator makes the sample identical on every run.
rng=np.random.RandomState(72510)

# Each row draws an integer in [0, SAMPLE_ONE_IN); every non-zero draw marks
# the row for removal, so only the rows that drew 0 survive.
rows=rng.randint(SAMPLE_ONE_IN,size=size).astype('bool')
print(rows)
# NOTE: the frames are rebound to their sampled versions, so re-running this
# cell samples the already reduced data again; re-run from the loading cell
# to get the full dataset back.
X_train_raw=X_train_raw[~rows]
y_train_raw=y_train_raw[~rows]
print(len(X_train_raw))
print(X_train_raw.index)
print(y_train_raw.index)

# print(idx.index)
# X_train_raw=X_train_raw.loc[idx.index]
# y_train_raw=y_train_raw.loc[idx.index]
# print(X_data.index)
# print(y_data.index)


In [None]:
%%time
reviews=preprocess_data(X_train_raw)
# Store one token list per row in a 1-D object array. The array is filled
# element by element because np.array() on lists of different lengths raises
# on recent NumPy versions, and on lists of equal length it would build a
# 2-D array whose rows are no longer Python lists.
X_data=np.empty(len(reviews),dtype=object)
for pos,tokens in enumerate(reviews):
    X_data[pos]=tokens
y_data=y_train_raw.values
print(type(X_data),type(y_data))


In [None]:
def convertRawData(X,idx):
    """Build a bag-of-words count table for the selected documents.

    Args:
        X: indexable collection in which every item is a list of tokens.
        idx: positions into ``X`` that select the documents to use.

    Returns:
        A DataFrame with one row per entry of ``idx`` (in that order) and one
        column per distinct token found in the selected documents. Columns
        are sorted so the layout is the same on every run; each cell holds
        how often the token occurs in the document.
    """
    from collections import Counter

    idx=list(idx)
    # Count every selected document once instead of rescanning it for each
    # vocabulary word.
    counts=[Counter(X[i]) for i in idx]

    vocabularies=set()
    for doc_counts in counts:
        vocabularies.update(doc_counts)
    print(f"{len(vocabularies)} vocabularies found")

    # A Counter returns 0 for tokens a document does not contain.
    data={
        voc:[doc_counts[voc] for doc_counts in counts]
        for voc in tqdm.tqdm(sorted(vocabularies))
    }
    return pandas.DataFrame(data)

In [None]:
from classifiers.DecisionTreeClassifier import DecisionTreeClassifier
from StatisticManager import StatisticManager
from sklearn.model_selection import KFold

# 10-fold split with a fixed seed so the folds are the same on every run.
kf=KFold(n_splits=10,random_state=72510,shuffle=True)
print(kf)
import time

# Number of folds that are actually trained and evaluated. Only the first
# fold is run by default; raise it (up to 10) for a full cross-validation.
MAX_FOLDS=1

for i,(train_idx,test_idx) in enumerate(kf.split(X_data)):
    print("==================================================")
    print(f"Fold {i}: {len(train_idx)}:{len(test_idx)}")
    # print(f"Train: index={train_idx}")
    # print(f"Test:  index={test_idx}")

    X_train=convertRawData(X_data,train_idx)
    y_train=pandas.DataFrame(y_data[train_idx],columns=[target_feature])

    # The test matrix must expose exactly the training columns: words that
    # only occur in the test fold are dropped and training words missing
    # from the test fold are added with a count of 0.
    X_test=convertRawData(X_data,test_idx).reindex(columns=X_train.columns,fill_value=0)
    y_test=pandas.DataFrame(y_data[test_idx],columns=[target_feature])

    # X_train.info()
    # X_test.info()
    # y_train.info()
    # y_test.info()

    clf=DecisionTreeClassifier(depth=3,target_feature=target_feature)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)

    statistic=StatisticManager(target_feature)
    statistic.report(y_pred,y_test)
    # statistic.evaluate(clf,X_train,y_train)

    if i+1>=MAX_FOLDS:
        break


