# IMPORT PACKAGE

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# IMPORT DATA

In [2]:
# Load the dataset
trainingSet = pd.read_csv("./data/X_train.csv")
submissionSet = pd.read_csv("./data/X_test.csv")

# DATA PROCESSING

In [3]:
def process(trainingSet,submissionSet,col = 'Text'):
    training_helpful = trainingSet[(trainingSet['HelpfulnessNumerator']<=trainingSet['HelpfulnessDenominator'])]
    training_drop = training_helpful.dropna()
    print("train set after cleaning wrong in helpfulness:   " , trainingSet.shape)
    print("train set after drop NaN:   ",training_drop.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(
        training_drop.drop(['Score'], axis=1),
        training_drop['Score'],
        test_size=1/4.0,
        random_state=0
    )


    if col == 'Text':
        drop_col = ['Id', 'ProductId', 'UserId', 'Summary', 'Time']
    elif col == 'Summary':
        drop_col = ['Id', 'ProductId', 'UserId', 'Text', 'Time']

    X_train_processed = X_train.drop(columns = drop_col)
    X_test_processed = X_test.drop(columns = drop_col)
    submission_processed = submissionSet.drop(columns = drop_col)
    print("train set shape:  ",X_train_processed.shape,"test set shape:  ",X_test_processed.shape)
    
    return X_train_processed,X_test_processed,Y_train,Y_test,submission_processed

In [4]:
#clean the text
#https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
#https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
def remove_char(x):
    special = '[^A-Za-z0-9 ]+'
    x = re.sub(special,'',x)
    x = x.strip()
    x = x.lower()
    return x


def clean_word(dataset,col):
    stop_words = set(stopwords.words('english'))
    test = dataset[col].apply(lambda row: remove_char(str(row))).apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    return test

In [7]:
# stemming test
#https://stackoverflow.com/questions/37443138/python-stemming-with-pandas-dataframe
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def stem(df):
    df = df.str.split()
    df = df.apply(lambda x: [stemmer.stem(y) for y in x])
    df = df.apply(lambda x: ','.join(map(str, x)))
    return df


In [12]:
#SEPERATE THE DATASET text
X_train_text,X_test_text,Y_train,Y_test,submission_text = process(trainingSet,submissionSet,'Text')

train set after cleaning wrong in helpfulness:    (1397533, 10)
train set after drop NaN:    (1397455, 10)
train set shape:   (1048091, 4) test set shape:   (349364, 4)


In [13]:
# clean the text 
X_train_text = clean_word(X_train_text,'Text')
X_test_text = clean_word(X_test_text,'Text')
submission_text = clean_word(submission_text,'Text')
X_train_text.head()

214099    big fan miami vice enjoyed movie mainly enjoy ...
544327    intimate fascinating biographic documentary gr...
500542    movie really great way movie designed looked l...
493726    tried watching tv missed several episodes enjo...
579893    ok last movies shelf would reccomend someone t...
Name: Text, dtype: object

In [14]:
# apply stem to text
X_train_text_stem = stem(X_train_text)
X_test_text_stem = stem(X_test_text)
X_submission_text_stem = stem(submission_text)
X_train_text_stem.head()

214099    big,fan,miami,vice,enjoy,movi,main,enjoy,style...
544327    intim,fascin,biograph,documentari,great,ballet...
500542    movi,realli,great,way,movi,design,look,like,se...
493726    tri,watch,tv,miss,sever,episod,enjoy,abl,watch...
579893    ok,last,movi,shelf,would,reccomend,someon,type...
Name: Text, dtype: object

In [17]:
#SEPERATE THE DATASET summary
X_train_Summary,X_test_Summary,Y_train,Y_test,submission_Summary = process(trainingSet,submissionSet,'Summary')

train set after cleaning wrong in helpfulness:    (1397533, 10)
train set after drop NaN:    (1397455, 10)
train set shape:   (1048091, 4) test set shape:   (349364, 4)


In [18]:
# clean the summary
X_train_Summary = clean_word(X_train_Summary,'Summary')
X_test_Summary = clean_word(X_test_Summary,'Summary')
submission_Summary = clean_word(submission_Summary,'Summary')
X_train_Summary.head()

214099                 miami vice meets silence lambs
544327    intimate fascinating biographic documentary
500542                                          great
493726                                    24 season 1
579893                                       response
Name: Summary, dtype: object

In [19]:
# apply stem to summary
X_train_Summary_stem = stem(X_train_Summary)
X_test_Summary_stem = stem(X_test_Summary)
X_submission_Summary_stem = stem(submission_Summary)
X_train_Summary_stem.head()

214099          miami,vice,meet,silenc,lamb
544327    intim,fascin,biograph,documentari
500542                                great
493726                          24,season,1
579893                              respons
Name: Summary, dtype: object

In [20]:
print("shape of text train set:",X_train_text.shape)
print("shape of text train stem set:",X_train_text_stem.shape)
print("shape of summary train set:",X_train_Summary.shape)
print("shape of summary train stem set:",X_train_Summary_stem.shape)

shape of text train set: (1048091,)
shape of text train stem set: (1048091,)
shape of summary train set: (1048091,)
shape of summary train stem set: (1048091,)


# VECTORIZING THE DATA

In [21]:
#countVector text
text_vectorizer = CountVectorizer()
X_train_text_vector = text_vectorizer.fit_transform(X_train_text)
X_test_text_vector = text_vectorizer.transform(X_test_text)
submission_text_vector = text_vectorizer.transform(submission_text)

In [22]:
#tfidf text
text_tfidf = TfidfVectorizer()
X_train_text_tfidf = text_tfidf.fit_transform(X_train_text)
X_test_text_tfidf = text_tfidf.transform(X_test_text)
submission_text_tfidf = text_tfidf.transform(submission_text)

In [29]:
#countVector text stem
text_stem_vectorizer = CountVectorizer()
X_train_text_stem_vector = text_stem_vectorizer.fit_transform(X_train_text_stem)
X_test_text_stem_vector = text_stem_vectorizer.transform(X_test_text_stem)
X_submission_text_stem_vector = text_stem_vectorizer.transform(X_submission_text_stem)

In [30]:
#tfidf text stem
text_stem_tfidf = TfidfVectorizer()
X_train_text_stem_tfidf = text_stem_tfidf.fit_transform(X_train_text_stem)
X_test_text_stem_tfidf = text_stem_tfidf.transform(X_test_text_stem)
submission_text_stem_tfidf = text_stem_tfidf.transform(X_submission_text_stem)

In [31]:
#countVector summary
summary_vectorizer = CountVectorizer()
X_train_summary_vector = summary_vectorizer.fit_transform(X_train_Summary)
X_test_summary_vector = summary_vectorizer.transform(X_test_Summary)
submission_summary_vector = summary_vectorizer.transform(submission_Summary)

In [32]:
#tfidf summary
summary_tfidf = TfidfVectorizer()
X_train_summary_tfidf = summary_tfidf.fit_transform(X_train_Summary)
X_test_summary_tfidf = summary_tfidf.transform(X_test_Summary)
submission_summary_tfidf = summary_tfidf.transform(submission_Summary)

In [33]:
#countVector summary stem
summary_stem_vectorizer = CountVectorizer()
X_train_summary_stem_vector = summary_stem_vectorizer.fit_transform(X_train_Summary_stem)
X_test_summary_stem_vector = summary_stem_vectorizer.transform(X_test_Summary_stem)
X_submission_summary_stem_vector = summary_stem_vectorizer.transform(X_submission_Summary_stem)

In [52]:
#tfidf text stem
summary_stem_tfidf = TfidfVectorizer()
X_train_summary_stem_tfidf = summary_stem_tfidf.fit_transform(X_train_Summary_stem)
X_test_summary_stem_tfidf = summary_stem_tfidf.transform(X_test_Summary_stem)
submission_summary_stem_tfidf = summary_stem_tfidf.transform(X_submission_Summary_stem)

# MODEL TRAINING & evaluation
### TRAIN THE MODEL
### https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/

In [36]:
NB_text_vector = MultinomialNB()
NB_text_vector.fit(X_train_text_vector,Y_train)
Y_test_vector_predictions = NB_text_vector.predict(X_test_text_vector)
print("RMSE on text CountVector testing set = ", mean_squared_error(Y_test, Y_test_vector_predictions))

RMSE on text CountVector testing set =  1.2909944928498642


In [46]:
NB_text_tfidf = MultinomialNB()
NB_text_tfidf.fit(X_train_text_tfidf,Y_train)
Y_text_tfidf_predictions = NB_text_tfidf.predict(X_test_text_vector)
print("RMSE on text TFIDF testing set = ", mean_squared_error(Y_test, Y_text_tfidf_predictions))

RMSE on text TFIDF testing set =  2.209271705155654


In [40]:
NB_text_stem_vector = MultinomialNB()
NB_text_stem_vector.fit(X_train_text_stem_vector,Y_train)
Y_test_stem_vector_predictions = NB_text_stem_vector.predict(X_test_text_stem_vector)
print("RMSE on text stem CountVector testing set = ", mean_squared_error(Y_test, Y_test_stem_vector_predictions))

RMSE on text stem CountVector testing set =  1.322563286429054


In [50]:
NB_text_stem_tfidf = MultinomialNB()
NB_text_stem_tfidf.fit(X_train_text_stem_tfidf,Y_train)
Y_text_stem_tfidf_predictions = NB_text_stem_tfidf.predict(X_test_text_stem_tfidf)
print("RMSE on text stem TFIDF testing set = ", mean_squared_error(Y_test, Y_text_stem_tfidf_predictions))

RMSE on text stem TFIDF testing set =  2.210284974983112


In [47]:
NB_summary_vector = MultinomialNB()
NB_summary_vector.fit(X_train_summary_vector,Y_train)
Y_summary_vector_predictions = NB_summary_vector.predict(X_test_summary_vector)
print("RMSE on summary CountVector testing set = ", mean_squared_error(Y_test, Y_summary_vector_predictions))

RMSE on summary CountVector testing set =  1.3820399354255162


In [49]:
NB_summary_tfidf = MultinomialNB()
NB_summary_tfidf.fit(X_train_summary_tfidf,Y_train)
Y_summary_tfidf_predictions = NB_summary_tfidf.predict(X_test_summary_vector)
print("RMSE on summary TFIDF testing set = ", mean_squared_error(Y_test, Y_summary_tfidf_predictions))

RMSE on summary TFIDF testing set =  1.45298027272415


In [51]:
NB_summary_stem_vector = MultinomialNB()
NB_summary_stem_vector.fit(X_train_summary_stem_vector,Y_train)
Y_summary_stem_vector_predictions = NB_summary_stem_vector.predict(X_test_summary_stem_vector)
print("RMSE on summary stem CountVector testing set = ", mean_squared_error(Y_test, Y_summary_stem_vector_predictions))

RMSE on summary stem CountVector testing set =  1.4092808646569195


In [54]:
NB_summary_stem_tfidf = MultinomialNB()
NB_summary_stem_tfidf.fit(X_train_summary_stem_tfidf,Y_train)
Y_summary_stem_tfidf_predictions = NB_summary_stem_tfidf.predict(X_test_summary_stem_tfidf)
print("RMSE on summary stem TFIDF testing set = ", mean_squared_error(Y_test, Y_summary_stem_tfidf_predictions))

RMSE on summary stem TFIDF testing set =  1.6301422012571416


# CREATE SUBMISSION SET

In [93]:
# submission_predict = submissionSet
# submission_predict.head()
# submission_predict['Score'] = NB_text_vector.predict(submission_text_vector)
# submission_output = submission_predict[['Id','Score']]
# submission_output.to_csv("./data/stem&vector_submission.csv",index = False)