# IMPORT PACKAGE

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn import svm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# IMPORT DATA

In [27]:
# Load the dataset
trainingSet = pd.read_csv("./data/X_train.csv")
submissionSet = pd.read_csv("./data/X_test.csv")

# DATA PROCESSING

In [17]:
def process(trainingSet,submissionSet,col = 'Text'):
    training_helpful = trainingSet[(trainingSet['HelpfulnessNumerator']<=trainingSet['HelpfulnessDenominator'])]
    training_drop = training_helpful.dropna()
    print("train set after cleaning wrong in helpfulness:   " , trainingSet.shape)
    print("train set after drop NaN:   ",training_drop.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(
        training_drop.drop(['Score'], axis=1),
        training_drop['Score'],
        test_size=1/4.0,
        random_state=0
    )


    if col == 'Text':
        drop_col = ['Id', 'ProductId', 'UserId', 'Summary', 'Time']
    elif col == 'Summary':
        drop_col = ['Id', 'ProductId', 'UserId', 'Text', 'Time']

    X_train_processed = X_train.drop(columns = drop_col)
    X_test_processed = X_test.drop(columns = drop_col)
    submission_processed = submissionSet.drop(columns = drop_col)
    print("train set shape:  ",X_train_processed.shape,"test set shape:  ",X_test_processed.shape)
    
    return X_train_processed,X_test_processed,Y_train,Y_test,submission_processed

In [18]:
#clean the text
#https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
#https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
def remove_char(x):
    special = '[^A-Za-z ]+'
    x = re.sub(special,'',x)
    x = x.strip()
    x = x.lower()
    return x


def clean_word(dataset,col):
    stop_words = set(stopwords.words('english'))
    test = dataset[col].apply(lambda row: remove_char(str(row))).apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    return test

In [19]:
trainingSet['Text'] = trainingSet['Text'] + ' ' + trainingSet['Summary']
submissionSet['Text'] = submissionSet['Text']  + ' ' + submissionSet['Summary']


In [20]:
#SEPERATE THE DATASET 
X_train_processed,X_test_processed,Y_train,Y_test,submission_processed = process(trainingSet,submissionSet,'Text')

train set after cleaning wrong in helpfulness:    (1397533, 10)
train set after drop NaN:    (1397455, 10)
train set shape:   (1048091, 4) test set shape:   (349364, 4)


In [21]:
X_train_processed.head()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Text,Helpfulness
214099,0,0,"As a big fan of Miami Vice, I enjoyed this mov...",0.0
544327,0,0,Intimate and fascinating biographic documentar...,0.0
500542,1,2,This movie was really not all that great. The...,0.5
493726,0,0,Tried watching this on TV but missed several e...,0.0
579893,0,0,it's ok if you had last few movies on the she...,0.0


In [22]:
# APLLY THE CLEANNING FUNCTION TO DATAFRAME
X_train_text = clean_word(X_train_processed,'Text')
X_test_text = clean_word(X_test_processed,'Text')
submission_text = clean_word(submission_processed,'Text')
X_train_text.head()

214099    big fan miami vice enjoyed movie mainly enjoy ...
544327    intimate fascinating biographic documentary gr...
500542    movie really great way movie designed looked l...
493726    tried watching tv missed several episodes enjo...
579893    ok last movies shelf would reccomend someone t...
Name: Text, dtype: object

# VECTORIZING THE DATA

In [23]:
#tfidf the data
tfidf_text = TfidfVectorizer()
X_training_tfidf_text = tfidf_text.fit_transform(X_train_text)
X_test_tfidf_text = tfidf_text.transform(X_test_text)
submission_tfidf_text = tfidf_text.transform(submission_text)
print("shape of training set TFIDF&text: ",X_training_tfidf_text.shape)

shape of training set TFIDF&text:  (1048091, 1755702)


# MODEL TRAINING

In [25]:
SVM_combine = svm.LinearSVC()
SVM_combine.fit(X_training_tfidf_text,Y_train)

LinearSVC()

# EVALUATE THE MODEL

In [26]:
Y_combine = SVM_combine.predict(X_test_tfidf_text)
print("RMSE on SVM_combine testing set = ", mean_squared_error(Y_test, Y_combine))

RMSE on SVM_combine testing set =  0.8767674975097606


# CREATE SUBMISSION SET

In [28]:
submission_predict = submissionSet
submission_predict['Score'] = SVM_combine.predict(submission_tfidf_text)
submission_output = submission_predict[['Id','Score']]
submission_output.to_csv("./data/SVM_combine_submission.csv",index = False)

# Product Id TEST

In [11]:
# Predict_output = pd.read_csv("./data/SVM_combine_submission.csv.csv")
# raw = pd.read_csv("./data/train.csv")
# predict_merge = pd.merge(Predict_output, raw, left_on='Id', right_on='Id')

In [5]:
# test = raw.loc[:,["ProductId",'Score']].groupby(by = 'ProductId').mean()
# test5 = test[test['Score']==5.0]
# test4 = test[test['Score']==4.0]
# test3 = test[test['Score']==3.0]
# test2 = test[test['Score']==2.0]
# test1 = test[test['Score']==1.0]
# print("score 5:",test5.shape)
# print("score 4:",test4.shape)
# print("score 3:",test3.shape)
# print("score 2:",test2.shape)
# print("score 1:",test1.shape)

score 5: (2200, 1)
score 4: (2689, 1)
score 3: (904, 1)
score 2: (249, 1)
score 1: (23, 1)


In [74]:
# product5 = test5.index
# product4 = test4.index
# product3 = test3.index
# product2 = test2.index
# product1 = test1.index

In [87]:
# predict_productid = predict_merge.loc[:,["Id","ProductId",'Score_x']]
# predict_productid

Unnamed: 0,Id,ProductId,Score_x
0,5,0005019281,5.0
1,11,0005019281,4.0
2,17,0005019281,4.0
3,46,0005019281,5.0
4,47,0005019281,1.0
...,...,...,...
299995,1697520,B00LH9ROKM,4.0
299996,1697522,B00LT1JHLW,5.0
299997,1697524,B00LT1JHLW,3.0
299998,1697527,B00LT1JHLW,5.0


In [88]:
# predict_productid['check5'] = predict_productid['ProductId'].isin(product5)
# predict_productid['check4'] = predict_productid['ProductId'].isin(product4)
# predict_productid['check3'] = predict_productid['ProductId'].isin(product3)
# predict_productid['check2'] = predict_productid['ProductId'].isin(product2)
# predict_productid['check1'] = predict_productid['ProductId'].isin(product1)

In [89]:
# predict_productid.loc[predict_productid.check5==True,'Score_x'] = 5.0
# predict_productid.loc[predict_productid.check4==True,'Score_x'] = 4.0
# predict_productid.loc[predict_productid.check3==True,'Score_x'] = 3.0
# predict_productid.loc[predict_productid.check2==True,'Score_x'] = 2.0
# predict_productid.loc[predict_productid.check1==True,'Score_x'] = 1.0

In [90]:
# predict_productid

Unnamed: 0,Id,ProductId,Score_x,check5,check4,check3,check2,check1
0,5,0005019281,5.0,False,False,False,False,False
1,11,0005019281,4.0,False,False,False,False,False
2,17,0005019281,4.0,False,False,False,False,False
3,46,0005019281,5.0,False,False,False,False,False
4,47,0005019281,1.0,False,False,False,False,False
...,...,...,...,...,...,...,...,...
299995,1697520,B00LH9ROKM,4.0,False,False,False,False,False
299996,1697522,B00LT1JHLW,5.0,False,False,False,False,False
299997,1697524,B00LT1JHLW,3.0,False,False,False,False,False
299998,1697527,B00LT1JHLW,5.0,False,False,False,False,False


In [93]:
# submission_product = predict_productid[['Id','Score_x']]
# submission_product = submission_product.set_axis(['Id','Score'],axis=1)

In [96]:
# submission_product.to_csv("./data/product_test.csv",index = False)