In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import string
import collections

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
TRAIN_PATH  = '/content/gdrive/My Drive/Colab Notebooks/X_train.csv'
SUBMISSION_PATH =  '/content/gdrive/My Drive/Colab Notebooks/X_submission.csv'
X_train_origin = pd.read_csv(TRAIN_PATH)
X_submission = pd.read_csv(SUBMISSION_PATH)

In [4]:
X_temp = X_train_origin['Score'].to_list()
counter = dict(collections.Counter(X_temp))
print(counter)
print(counter.values())
total = sum(list(counter.values()))
print(total)
for i in counter:
  counter[i] = counter[i] / total
print(counter)

{4.0: 315587, 3.0: 165727, 5.0: 746520, 2.0: 84084, 1.0: 85615}
dict_values([315587, 165727, 746520, 84084, 85615])
1397533
{4.0: 0.2258172078941964, 3.0: 0.11858539297462027, 5.0: 0.5341698550231014, 2.0: 0.06016602112436701, 1.0: 0.06126152298371487}


In [24]:
# further preprocess

def get_max_wd(ws_array_normalized, words_list, s):
  if type(s) == float: # nan
    return 5
  temp = np.array([0.0, 0.0, 0.0, 0.0, 0.0])
  find = False # if any word is in the vocabulary
  s = s.lower()
  exclude = set(string.punctuation)
  s = ''.join(ch for ch in s if ch not in exclude)
  words = s.split(' ')
  for word in words:
    if word in words_list:
      find = True
      temp = temp + ws_array_normalized[words_list.index(word)]
  if not find:
    return 5
  temp_list = temp.tolist()
  return temp_list.index(max(temp_list)) + 1

def get_avg_score(x, dic, most_score):
  if x in dic:
    return dic[x]
  return most_score

def preprocess(dataframe, array_normalized_summary, words_summary, array_normalized_text, words_text, film_dic, user_dic, most_score):
  # fill nan in 'text' and 'Summary'
  temp = dataframe.drop(columns = ['ProductId', 'UserId', 'Time'])
  temp['Summary'] = temp['Summary'].fillna('')
  temp['Text'] = temp['Text'].fillna('')

  # transform user id and product id
  temp['film_avg'] = dataframe['ProductId'].map(lambda x: get_avg_score(x, film_dic, most_score))
  temp['user_avg'] = dataframe['UserId'].map(lambda x: get_avg_score(x, user_dic, most_score))
  
  # calculate the most likely score it would be with the average vector from word-document matrix
  temp['S'] = dataframe['Summary'].map(lambda x: get_max_wd(array_normalized_summary, words_summary, x))
  temp['T'] = dataframe['Summary'].map(lambda x: get_max_wd(array_normalized_text, words_text, x))

  return temp

In [6]:
film_scores_df = X_train_origin[['ProductId', 'Score']]
film_df = film_scores_df.groupby('ProductId').mean()
film_dic = film_df.to_dict()['Score']

#print(film_dic) 
print(len(film_dic))

50050


In [7]:
''' film_df_std = film_scores_df.groupby('ProductId').std()
film_dic_std = film_df_std.to_dict()['Score']
print(len(film_dic_std)) '''

" film_df_std = film_scores_df.groupby('ProductId').std()\nfilm_dic_std = film_df_std.to_dict()['Score']\nprint(len(film_dic_std)) "

In [8]:
users_scores_df = X_train_origin[['UserId', 'Score']]
user_df = users_scores_df.groupby('UserId').mean()
user_dic = user_df.to_dict()['Score']

#print(user_dic)
print(len(user_dic))

123958


In [9]:
''' user_df_std = users_scores_df.groupby('UserId').std()
user_dic_std = user_df_std.to_dict()['Score']
print(len(user_dic_std)) '''

" user_df_std = users_scores_df.groupby('UserId').std()\nuser_dic_std = user_df_std.to_dict()['Score']\nprint(len(user_dic_std)) "

In [10]:
# NLP
''' all_summaries = X_train_origin['Summary'].fillna('').to_list()
vectorizer = TfidfVectorizer(stop_words='english')
words = vectorizer.fit_transform(all_summaries)
word_list = vectorizer.get_feature_names()
count_list = np.asarray(words.sum(axis=0))[0]
dic = dict(zip(word_list, count_list))
the_vocab = list(dict(sorted(dic.items(), key=lambda item: item[1], reverse=True)))[:200] # 200 most common words
print(the_vocab) '''
''' with open("/content/gdrive/My Drive/Colab Notebooks/data/summary200words.txt", 'w') as f:
  f.write(' '.join(the_vocab)) '''

' with open("/content/gdrive/My Drive/Colab Notebooks/data/summary200words.txt", \'w\') as f:\n  f.write(\' \'.join(the_vocab)) '

In [11]:
with open("/content/gdrive/My Drive/Colab Notebooks/data/summary200words.txt", 'r') as f:
  the_vocab = f.read().split(' ')
print(the_vocab)

documents_summary = []
documents_text = []
for score in [1,2,3,4,5]:
    df_temp = X_train_origin[X_train_origin['Score'] == score]
    str_list_temp = df_temp['Summary'].to_list()
    str_list_temp2 = df_temp['Text'].to_list()
    str_list_temp = [s for s in str_list_temp if type(s) != float]
    str_list_temp2 = [s for s in str_list_temp2 if type(s) != float]
    document_summary = ' '.join(str_list_temp)
    document_text = ' '.join(str_list_temp2)
    documents_summary.append(document_summary)
    documents_text.append(document_text)

vectorizer2 = TfidfVectorizer(stop_words='english')
words2 = vectorizer2.fit_transform(the_vocab)
words_score = vectorizer2.transform(documents_summary)
ws_array = np.array(words_score.toarray().T)
#print(ws_array)

row_sums = ws_array.sum(axis=1)
ws_array_normalized = ws_array / row_sums[:, np.newaxis]
#print(ws_array_normalized)

['movie', 'great', 'good', 'love', 'best', 'film', 'classic', 'fun', 'dvd', 'excellent', 'funny', 'series', 'stars', 'better', 'story', 'bad', 'time', 'awesome', 'season', 'movies', 'like', 'just', 'wonderful', 'entertaining', 'loved', 'watch', 'favorite', 'action', 'really', 'family', 'comedy', 'worth', 'ok', 'review', 'interesting', 'don', 'amazing', 'horror', 'fantastic', 'old', 'ray', 'blu', 'tv', 'nice', 'cute', 'pretty', 'wow', 'enjoyable', 'beautiful', 'collection', 'little', 'flick', 'original', 'star', 'hilarious', 'new', 'boring', 'expected', 'life', 'true', 'set', 'man', 'perfect', 'way', 'christmas', 'okay', 'films', 'buy', 'masterpiece', 'thriller', 'real', 'drama', 'version', 'brilliant', 'disappointing', 'fan', 'seen', 'quot', 'kids', 'finally', 'war', 'book', 'acting', 'quality', 'entertainment', 'watching', 'liked', 'worst', 've', 'fans', 'long', 'big', 'greatest', 'workout', 'outstanding', 'sequel', 'western', 'say', 'fi', 'video', 'sci', 'different', 'cool', 'gift', 

In [12]:
# do the same on text
''' all_texts = X_train_origin['Text'].fillna('').to_list()
vectorizer3 = TfidfVectorizer(stop_words='english')
words3 = vectorizer3.fit_transform(all_texts)
word_list2 = vectorizer3.get_feature_names()
count_list2 = np.asarray(words3.sum(axis=0))[0]
dic2 = dict(zip(word_list2, count_list2))
the_vocab2 = list(dict(sorted(dic2.items(), key=lambda item: item[1], reverse=True)))[:200] # 200 most common words
print(the_vocab2) '''
''' with open("/content/gdrive/My Drive/Colab Notebooks/data/text200words.txt", 'w') as f:
  f.write(' '.join(the_vocab2)) '''

' with open("/content/gdrive/My Drive/Colab Notebooks/data/text200words.txt", \'w\') as f:\n  f.write(\' \'.join(the_vocab2)) '

In [13]:
with open("/content/gdrive/My Drive/Colab Notebooks/data/text200words.txt", 'r') as f:
  the_vocab2 = f.read().split(' ')
print(the_vocab2)

vectorizer4 = TfidfVectorizer(stop_words='english')
words4 = vectorizer4.fit_transform(the_vocab2)
words_score2 = vectorizer4.transform(documents_text)
ws_array2 = np.array(words_score2.toarray().T)

row_sums2 = ws_array2.sum(axis=1)
ws_array_normalized2 = ws_array2 / row_sums2[:, np.newaxis]

['movie', 'film', 'great', 'good', 'like', 'just', 'love', 'story', 'watch', 'dvd', 'really', 'movies', 'time', 'series', 'best', '34', 'season', 'quot', 'don', 'watching', 'better', 'did', 'funny', 'think', 'seen', 'acting', 'way', 'people', 'characters', 'action', 'little', 'make', 'fun', 'bad', 'family', 'loved', 'know', 'life', 'enjoyed', 'old', 'enjoy', 'recommend', 'films', 'watched', 'worth', 'say', 'character', 'plot', 've', 'does', 'new', 'end', 'years', 'actors', 'set', 'fan', 'didn', 'excellent', 'lot', 'want', 'times', 'scenes', 'cast', 'tv', 'man', 'original', 'thought', 'buy', 'got', 'classic', 'real', 'version', 'interesting', 'saw', 'going', 'collection', 'makes', 'favorite', 'music', 'pretty', 'long', 'quality', 'shows', 'wonderful', 'kids', 'special', 'liked', 'stars', 'bit', 'book', 'episodes', 'look', 'll', 'work', 'comedy', 'world', 'ray', 'thing', 'horror', 'things', 'star', 'young', 'big', 'bought', 'entertaining', 'right', 'scene', 'ending', 'video', 'feel', 'ep

In [25]:
X_train_origin_processed = preprocess(X_train_origin, ws_array_normalized, the_vocab, ws_array_normalized2, the_vocab2, film_dic, user_dic, 5)

In [26]:
X_sub = preprocess(X_submission, ws_array_normalized, the_vocab, ws_array_normalized2, the_vocab2, film_dic, user_dic, 5).drop(['Score'], axis=1)

In [27]:
X_train_origin_processed.to_csv("/content/gdrive/My Drive/Colab Notebooks/data/X_train_origin_processed_NLP.csv", index=False)
X_sub.to_csv("/content/gdrive/My Drive/Colab Notebooks/data/X_sub_NLP.csv", index=False)

In [28]:
X_train_origin_processed.head(30)

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text,film_avg,user_avg,S,T
0,0,0,0,4.0,good version of a classic,This is a charming version of the classic Dick...,4.483871,4.333333,4,2
1,1,0,0,3.0,Good but not as moving,It was good but not as emotionally moving as t...,4.483871,3.6,3,2
2,2,0,0,3.0,Winkler's Performance was ok at best!,"Don't get me wrong, Winkler is a wonderful cha...",4.483871,3.8,5,2
3,3,0,0,5.0,It's an enjoyable twist on the classic story,Henry Winkler is very good in this twist on th...,4.483871,3.6,2,5
4,4,0,0,4.0,Best Scrooge yet,This is one of the best Scrooge movies out. H...,4.483871,4.176471,5,3
5,6,1,1,5.0,A MUST-HAVE FOR ANY VIDEO CHRISTMAS COLLECTION!!,This is the American adaptation of the Charles...,4.483871,4.272727,4,4
6,7,0,0,5.0,An American Christmas Carol,Glad that this american classic came out on dv...,4.483871,4.333333,4,3
7,8,0,0,5.0,an american christmas carol,A good Christmas carol dhenry winkler one duri...,4.483871,4.430769,4,3
8,9,0,0,5.0,Fantastic!,How a bitter old man comes to know the true me...,4.483871,5.0,2,5
9,10,0,0,5.0,"Outstanding Concept, and Performances","The small historic Canadian town of Elora, wit...",4.483871,4.75,2,5
