In [5]:
# Import 
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from sklearn.model_selection import GridSearchCV

In [6]:
# Load data
trainingSet = pd.read_csv("train.csv")
testingSet = pd.read_csv("test.csv")
print("finish loading")

finish loading


In [7]:
# Lemmatization text by removing stopwords 
def lemmatization(text):
    stopWord = stopwords.words('english')
    # init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    # remove non charaters
    text = re.sub(r'[^a-zA-Z]', ' ', str(text))
    # change all charaters to lower case
    words = text.lower().split()
    # run lemmatizer
    words = [lemmatizer.lemmatize(w) for w in words if w not in stopWord]
    return ' '.join(words)


In [8]:
# Adding new column of clean_review
trainingSet['clean_review'] = [lemmatization(text) for text in trainingSet.Text]
print("finsih lemmatization, see detail below:")

finsih lemmatization, see detail below:


In [9]:
print(trainingSet)

              Id   ProductId          UserId  HelpfulnessNumerator  \
0              0  0005019281   ADZPIG9QOCDG5                     0   
1              1  0005019281  A35947ZP82G7JH                     0   
2              2  0005019281  A3UORV8A9D5L2E                     0   
3              3  0005019281  A1VKW06X1O2X7V                     0   
4              4  0005019281  A3R27T4HADWFFJ                     0   
...          ...         ...             ...                   ...   
1697528  1697528  B00LT1JHLW   AV657BUYHHXZ2                     1   
1697529  1697529  B00LT1JHLW  A17W587EH23J0Q                    32   
1697530  1697530  B00LT1JHLW  A3DE438TF1A958                     3   
1697531  1697531  B00LT1JHLW  A2RWCXDMANY0LW                     0   
1697532  1697532  B00LT1JHLW  A3ROPC55BE2OM9                    11   

         HelpfulnessDenominator  Score        Time  \
0                             0    4.0  1203984000   
1                             0    3.0  1388361600 

In [None]:
# Calculate sentiment score
# Objective if close to 0
def polarity(text):
    textblob = TextBlob(text)
    return round(textblob.polarity,3)   

In [None]:
# Adding new column of clean_review
trainingSet['polarity'] = [polarity(text) for text in trainingSet.lemma_review]
print("finish calculate polarity, see detail below:")

In [None]:
print(trainingSet)

In [10]:
#create x_text and x_train
X_test = pd.merge(trainingSet, testingSet, left_on='Id', right_on='Id')


In [11]:
print(X_test)

             Id   ProductId          UserId  HelpfulnessNumerator  \
0             5  0005019281  A2L0G56BNOTX6S                     0   
1            11  0005019281  A33EWPXESP9GQH                     0   
2            17  0005019281  A13KAQO9F5X0FN                     0   
3            46  0005019281  A306NASGVUDFKF                    10   
4            47  0005019281  A38G1NN5SD81GD                     0   
...         ...         ...             ...                   ...   
299995  1697520  B00LH9ROKM   AYB0IXBPBJ20A                     0   
299996  1697522  B00LT1JHLW   AU73NIGESSIRE                    25   
299997  1697524  B00LT1JHLW  A3PPYOJBMFBP6U                     3   
299998  1697527  B00LT1JHLW  A2CA2Q6JS6CQAE                    10   
299999  1697528  B00LT1JHLW   AV657BUYHHXZ2                     1   

        HelpfulnessDenominator  Score_x        Time  \
0                            0      NaN  1383696000   
1                            0      NaN  1390780800   
2     

In [12]:
# Code provided in generate-Xtrain-Xsubmission.py
X_test = X_test.drop(columns=['Score_x'])
X_test = X_test.rename(columns={'Score_y': 'Score'})

X_test.to_csv("X_submission.csv", index=False)

X_train = trainingSet[trainingSet['Score'].notnull()]


X_train.to_csv("X_train.csv", index=False)
print("finish X_train and X_submission split")

finish X_train and X_submission split


In [13]:
print(X_train)

              Id   ProductId          UserId  HelpfulnessNumerator  \
0              0  0005019281   ADZPIG9QOCDG5                     0   
1              1  0005019281  A35947ZP82G7JH                     0   
2              2  0005019281  A3UORV8A9D5L2E                     0   
3              3  0005019281  A1VKW06X1O2X7V                     0   
4              4  0005019281  A3R27T4HADWFFJ                     0   
...          ...         ...             ...                   ...   
1697526  1697526  B00LT1JHLW  A22OB0DIJ5FO0G                     2   
1697529  1697529  B00LT1JHLW  A17W587EH23J0Q                    32   
1697530  1697530  B00LT1JHLW  A3DE438TF1A958                     3   
1697531  1697531  B00LT1JHLW  A2RWCXDMANY0LW                     0   
1697532  1697532  B00LT1JHLW  A3ROPC55BE2OM9                    11   

         HelpfulnessDenominator  Score        Time  \
0                             0    4.0  1203984000   
1                             0    3.0  1388361600 

In [14]:
# Read from file
X_train = pd.read_csv("X_train.csv")
X_submission = pd.read_csv("X_submission.csv")

# Split training set into training and testing set
# Code provided in predict-knn.py
X_train, X_test, Y_train, Y_test = train_test_split(
        X_train.drop(['Score'], axis=1),
        X_train['Score'],
        test_size=1/4.0,
        random_state=0
    )

In [15]:
X_train['clean_review'] = X_train['clean_review'].fillna("n")
X_test['clean_review'] = X_test['clean_review'].fillna("n")
X_submission['clean_review'] = X_submission['clean_review'].fillna("n")

In [16]:
print(X_train['clean_review'].isnull().sum())
print(X_test['clean_review'].isnull().sum())
print(X_submission['clean_review'].isnull().sum())

0
0
0


In [None]:
# Method 1: Vectorlize clean_review only 
# inti by setting feature number
vectorizer = CountVectorizer(max_features = 5000) 

# Vectorlize X_train, X_test and X_Submission
vector_train = vectorizer.fit_transform(X_train.clean_review).toarray()
vector_test = vectorizer.transform(X_test.clean_review).toarray()
vector_submit = vectorizer.transform(X_submission.clean_review).toarray()


In [17]:
# Process the DataFrames
# This is where you can do more feature extraction
X_train_processed = X_train.drop(columns=['Id', 'ProductId', 'UserId', 'Text', 'Summary', 'clean_review'])
X_test_processed = X_test.drop(columns=['Id', 'ProductId', 'UserId', 'Text', 'Summary','clean_review'])
X_submission_processed = X_submission.drop(columns=['Id', 'ProductId', 'UserId', 'Text', 'Summary', 'Score','clean_review'])


In [None]:
# Learn the review model
# Create random forest model
forestor = RandomForestClassifier(n_estimators = 100,n_jobs=8)
# Set Grid Search parameters
n_estimators = [100]
min_samples_split = [2]
min_samples_leaf = [1]
bootstrap = [True]
parameters = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf,'min_samples_split': min_samples_split}
# Apply Grid search 
clf = GridSearchCV(forestor, param_grid=parameters)
# Fit data to creat model
model_vector = clf.fit(vector_train, Y_train)
print("finish model_review")


In [18]:
# Learn the review model
#create random forest model
forestor2 = RandomForestClassifier(n_estimators = 100,n_jobs=8)
# Set Grid Search parameters
n_estimators = [100]
min_samples_split = [2]
min_samples_leaf = [1]
bootstrap = [True]
parameters = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf,'min_samples_split': min_samples_split}
# Apply Grid search 
clf2 = GridSearchCV(forestor2, param_grid=parameters)
# Fit data to creat model
model = clf2.fit(X_train_processed, Y_train)
print("finish model")

finish model


In [None]:
# Predict using vector
Y_test1 = model_vector.predict(vector_test)
X_submission['Score1'] = model_vector.predict(vector_submit)

# Predict using regular pd
Y_test2 = model.predict(X_test_processed)
X_submission['Score2'] = model.predict(X_submission_processed)

# Combine two methods together with different weight
Y_av = (Y_test1 * 0.7 + Y_test2 * 0.3).round(0)
X_submission['Score']=(X_submission['Score1']*0.7+X_submission['Score2']*0.3).round(0)


# Evaluate your model on the testing set
print("RMSE on testing set: review predict = ", mean_squared_error(Y_test, Y_test1))
print("RMSE on testing set: regular predict = ", mean_squared_error(Y_test, Y_test2))
print("RMSE on testing set: regular predict = ", mean_squared_error(Y_test, Y_av))

# Create the submission file
submission = X_submission[['Id', 'Score1']]
submission.to_csv("submission1.csv", index=False)

submission = X_submission[['Id', 'Score2']]
submission.to_csv("submission2.csv", index=False)

submission = X_submission[['Id', 'Score']]
submission.to_csv("submission.csv", index=False)

In [None]:
Y_av4 = (Y_test_review_pre * 0.6 + Y_test_regular_pre * 0.4).round(0)
X_submission['Score']=(X_submission['Score1']*0.6+X_submission['Score2']*0.4).round(0)

print("RMSE on testing set: regular predict = ", mean_squared_error(Y_test, Y_av4))

submission = X_submission[['Id', 'Score']]
submission.to_csv("submission.csv", index=False)

In [None]:
for i in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    Y_av4 = (Y_test_review_pre * i + Y_test_regular_pre * (1-i)).round(0)
    X_submission['Score']=(X_submission['Score1']*i+X_submission['Score2']*(1-i)).round(0)

    print("RMSE on testing set: regular predict = ", mean_squared_error(Y_test, Y_av4))

    submission = X_submission[['Id', 'Score']]
    submission.to_csv("submission.csv", index=False)

In [22]:
# Predict using regular pd
Y_test2 = model.predict(X_test_processed)
X_submission['Score'] = model.predict(X_submission_processed)
print("RMSE on testing set: regular predict = ", mean_squared_error(Y_test, Y_test2))

RMSE on testing set: regular predict =  1.820761683419962


In [23]:
submission = X_submission[['Id', 'Score']]
submission.to_csv("submission.csv", index=False)
