In [None]:
import pickle
import re
import string
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# nltk.download("stopwords")
# nltk.download("punkt")
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [40]:
# Convert the Unix timestamps in "Time" into year/month/day columns and a time-of-day string
def timestamp_to_date(df, tz=None):
    """Add calendar columns derived from the Unix timestamps in ``df["Time"]``.

    The frame is modified in place and also returned, so both
    ``timestamp_to_date(df)`` and ``df = timestamp_to_date(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"Time"`` column of POSIX timestamps (seconds).
    tz : datetime.tzinfo, optional
        Time zone used for the conversion. ``None`` (the default) converts
        to the local time of the machine running the code, so the derived
        columns depend on where the notebook is executed. Pass
        ``datetime.timezone.utc`` for machine-independent results.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns added (or overwritten):
        ``"Real_Time"`` (time of day as ``HH:MM:SS`` text, with fractional
        seconds when the timestamp has them), ``"Year"``, ``"Month"`` and
        ``"Day"`` (integers).
    """
    moments = [datetime.fromtimestamp(val, tz) for val in df["Time"]]

    # .time() drops any tzinfo, so the text is a plain wall-clock time
    # whether or not ``tz`` was supplied.
    df["Real_Time"] = [moment.time().isoformat() for moment in moments]

    # Read the calendar fields straight from the datetime objects instead of
    # formatting them to text and parsing the text back into integers.
    df["Year"] = np.array([moment.year for moment in moments], dtype=int)
    df["Month"] = np.array([moment.month for moment in moments], dtype=int)
    df["Day"] = np.array([moment.day for moment in moments], dtype=int)

    return df

In [41]:
# Stem every word of a text column with the Snowball stemmer
# col: either "Text" or "Summary"
def text_stem(df, col):
    """Add a ``<col>_Stemmed`` column holding the stemmed text of ``df[col]``.

    Each entry is split with ``word_tokenize``, every token is stemmed with
    an English ``SnowballStemmer``, and the stems are joined back together
    with single spaces (no leading or trailing space). An entry that yields
    no tokens becomes the empty string.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``col``.
    col : str
        Name of the text column to stem, e.g. ``"Text"`` or ``"Summary"``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column ``f"{col}_Stemmed"``.
    """
    snowball_stemmer = SnowballStemmer(language='english')

    # Missing entries are replaced by "" (the same convention
    # process_sentence uses) and everything is coerced to str, so
    # word_tokenize never receives a NaN/None or a non-string value.
    texts = df[col].fillna("").astype(str)

    stemmed_texts = []
    for value in texts:
        stems = (snowball_stemmer.stem(token) for token in word_tokenize(value))
        # join() builds the result in one pass and does not leave the stray
        # leading space that repeated `+= " " + word` produces.
        stemmed_texts.append(" ".join(stems))

    # Assign the list directly: wrapping it in np.array() first would build a
    # fixed-width unicode array sized (rows x longest text), which is very
    # memory hungry when a few reviews are long.
    df[f'{col}_Stemmed'] = stemmed_texts
    return df

In [42]:
# Load the training and test splits from CSV.
X_t = pd.read_csv("./data/X_train.csv")
X_s = pd.read_csv("./data/X_test.csv")

# X_train is bound to the same DataFrame object as X_t (no copy is made),
# so changes made through either name are visible through both.
X_train = X_t

# Only the last expression of a cell is displayed, so this bare .shape
# has no visible effect; the cell shows the head() preview below.
X_train.shape
X_train.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Score,Helpfulness,ReviewLength,SummaryLength,product_count,sentiment_scores
0,195370,1890228583,A3VLX5Z090RQ0V,1,2,1030838400,An Unexplained Anime Review,I was very anxious to see the Uncut version of...,2.0,0.5,234,4,4,0.139
1,1632470,B00BEIYSL4,AUDXDMFM49NGY,0,1,1405036800,not great.,Movie was okay...not great.,3.0,0.0,4,2,75,0.0
2,9771,0767809335,A3LFIA97BUU5IE,3,36,983750400,Technical problem with this DVD,"Like the Dinosaur Collector's Edition DVD, thi...",1.0,0.083333,26,5,7,0.125
3,218855,6300215792,A1QZM75342ZQVQ,1,1,1394841600,Heeeeyyyyy LAAAAADEEE!!!!,"Come on, now..... this has to be, by far, the...",5.0,1.0,157,2,4,0.064
4,936225,B000B5XOZW,ANM2SCEUL3WL1,1,1,1163721600,Herzog the Great Traveler of both natural and ...,I've always been a great admirer of Herzog's o...,4.0,1.0,277,10,1,0.028


In [81]:
# Text Process Step
def process_sentence(df):
    """Normalise the ``"Text"`` and ``"Summary"`` columns of ``df`` in place.

    For each of the two columns, missing values become ``""`` and every
    string is cleaned in this order:

    1. any word containing a digit is replaced by a single space;
    2. the text is lower-cased;
    3. every punctuation character is replaced by a single space.

    Returns the same ``df`` object with both columns overwritten.
    """
    digit_word = re.compile(r"""\w*\d\w*""")
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))

    def _clean(raw):
        # Digit-bearing words are removed before lower-casing, then the
        # punctuation is blanked out on the lower-cased text.
        without_digits = digit_word.sub(' ', raw)
        return punctuation.sub(' ', without_digits.lower())

    for column in ("Text", "Summary"):
        filled = df[column].fillna("")
        df[column] = filled.map(_clean)

    return df

In [44]:
# Clean the review text first, then derive the calendar columns from "Time".
X_train = X_train.pipe(process_sentence).pipe(timestamp_to_date)
X_train.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Score,Helpfulness,ReviewLength,SummaryLength,product_count,sentiment_scores,Real_Time,Year,Month,Day
0,195370,1890228583,A3VLX5Z090RQ0V,1,2,1030838400,an unexplained anime review,i was very anxious to see the uncut version of...,2.0,0.5,234,4,4,0.139,20:00:00,2002,8,31
1,1632470,B00BEIYSL4,AUDXDMFM49NGY,0,1,1405036800,not great,movie was okay not great,3.0,0.0,4,2,75,0.0,20:00:00,2014,7,10
2,9771,0767809335,A3LFIA97BUU5IE,3,36,983750400,technical problem with this dvd,like the dinosaur collector s edition dvd thi...,1.0,0.083333,26,5,7,0.125,19:00:00,2001,3,4
3,218855,6300215792,A1QZM75342ZQVQ,1,1,1394841600,heeeeyyyyy laaaaadeee,come on now this has to be by far the...,5.0,1.0,157,2,4,0.064,20:00:00,2014,3,14
4,936225,B000B5XOZW,ANM2SCEUL3WL1,1,1,1163721600,herzog the great traveler of both natural and ...,i ve always been a great admirer of herzog s o...,4.0,1.0,277,10,1,0.028,19:00:00,2006,11,16


In [45]:
# Add stemmed versions of both text columns
# ("Summary_Stemmed" and "Text_Stemmed").
X_train = X_train.pipe(text_stem, "Summary").pipe(text_stem, "Text")

X_train.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Score,Helpfulness,ReviewLength,SummaryLength,product_count,sentiment_scores,Real_Time,Year,Month,Day,Summary_Stemmed,Text_Stemmed
0,195370,1890228583,A3VLX5Z090RQ0V,1,2,1030838400,an unexplained anime review,i was very anxious to see the uncut version of...,2.0,0.5,234,4,4,0.139,20:00:00,2002,8,31,an unexplain anim review,i was veri anxious to see the uncut version o...
1,1632470,B00BEIYSL4,AUDXDMFM49NGY,0,1,1405036800,not great,movie was okay not great,3.0,0.0,4,2,75,0.0,20:00:00,2014,7,10,not great,movi was okay not great
2,9771,0767809335,A3LFIA97BUU5IE,3,36,983750400,technical problem with this dvd,like the dinosaur collector s edition dvd thi...,1.0,0.083333,26,5,7,0.125,19:00:00,2001,3,4,technic problem with this dvd,like the dinosaur collector s edit dvd this o...
3,218855,6300215792,A1QZM75342ZQVQ,1,1,1394841600,heeeeyyyyy laaaaadeee,come on now this has to be by far the...,5.0,1.0,157,2,4,0.064,20:00:00,2014,3,14,heeeeyyyyy laaaaadee,come on now this has to be by far the best je...
4,936225,B000B5XOZW,ANM2SCEUL3WL1,1,1,1163721600,herzog the great traveler of both natural and ...,i ve always been a great admirer of herzog s o...,4.0,1.0,277,10,1,0.028,19:00:00,2006,11,16,herzog the great travel of both natur and hum...,i ve alway been a great admir of herzog s oeu...


In [46]:
## Run the test split through the same steps as the training split:
## clean the text, derive the date columns, then stem Summary and Text.
X_test = X_s
X_test = (
    X_test
    .pipe(process_sentence)
    .pipe(timestamp_to_date)
    .pipe(text_stem, "Summary")
    .pipe(text_stem, "Text")
)
X_test.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Helpfulness,ReviewLength,SummaryLength,product_count,Score,sentiment_scores,Real_Time,Year,Month,Day,Summary_Stemmed,Text_Stemmed
0,786781,B0000VD02Y,A1UL8PS42M5DM8,1,7,1082332800,good scenery,ok the story may be a little slow and unreali...,0.142857,54,2,20,,0.102,20:00:00,2004,4,18,good sceneri,ok the stori may be a littl slow and unrealis...
1,17153,0767823931,A2OP1HD9RGX5OW,3,6,1055376000,reminded me of my childhood,wonderful film of julia taking her kids from ...,0.5,266,5,6,,0.046,20:00:00,2003,6,11,remind me of my childhood,wonder film of julia take her kid from englan...
2,1557328,B008JFUNTG,AY113687D8YK1,1,8,1377388800,hodgepodge concepts taken from four greek myths,this movie wasn t as interesting as the first ...,0.125,134,7,22,,0.071,20:00:00,2013,8,24,hodgepodg concept taken from four greek myth,this movi wasn t as interest as the first one...
3,1242666,B001UWOLQG,A2MVTAEGBP08RB,0,1,1374710400,good suspense,this series is about suspense it is well wri...,0.0,41,2,12,,0.0,20:00:00,2013,7,24,good suspens,this seri is about suspens it is well written...
4,1359242,B003QS0E54,ALGAE0IGE4DBP,99,103,1276646400,finally an intelligent idea,the first seasons of hunter were released a ...,0.961165,279,4,6,,0.065,20:00:00,2010,6,15,final an intellig idea,the first season of hunter were releas a few ...


In [None]:
# Persist the preprocessed train and test frames so later work can load
# them from disk instead of repeating the cleaning and stemming steps.
# NOTE(review): no index= argument is passed, so pandas' default applies and
# the row index is presumably written as an extra unnamed leading column --
# confirm that whatever reads these files expects that.
X_train.to_csv("./data/X_train_stemmed.csv")
X_test.to_csv("./data/X_test_stemmed.csv")