In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix,f1_score
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [2]:
df = pd.read_csv('vect_word.csv')

In [3]:
df.shape

(323, 4)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,text,score
0,0,570,k want great life job money girl etc play game...,90
1,1,730,play nice game much cheater almost imposs impr...,83
2,2,271590,hour shark card invest mod exploit use ever ba...,96
3,3,440,first download game februari first valv game f...,92
4,4,252950,rocket leagu less sport game actual sport fifa...,86


In [5]:
df.shape
df.dropna(inplace = True)

In [6]:
X_train,X_test,y_train,y_test=train_test_split(df['text'],
                                              df['score'],
                                              test_size=0.25,
                                              random_state=0)

In [7]:
#set type
X_train=X_train.values.astype('U')
X_test=X_test.values.astype('U')

In [8]:
#1-gram
vect=CountVectorizer(ngram_range=(1,1),min_df=5).fit(X_train)
vect_name=vect.get_feature_names()
X_train_vectorized = vect.transform(X_train)

In [9]:
# 2-gram
vect_bigram=CountVectorizer(ngram_range=(1,2),min_df=5).fit(X_train)
X_train_bivectorized = vect_bigram.transform(X_train)

In [10]:
# 3-gram
vect_trigram=CountVectorizer(ngram_range=(1,3),min_df=5).fit(X_train)
X_train_trivectorized = vect_trigram.transform(X_train)

In [11]:
# Linear Regression
from sklearn import linear_model
from sklearn.metrics import r2_score
# 1-gram
model = linear_model.LinearRegression()
model.fit(X_train_vectorized, y_train)
score_pred = model.predict(vect.transform(X_test))
# 2-gram
model_bigram=linear_model.LinearRegression()
model_bigram.fit(X_train_bivectorized,y_train)
predictions_bigram=model_bigram.predict(vect_bigram.transform(X_test))
# 3-gram
model_trigram=linear_model.LinearRegression()
model_trigram.fit(X_train_trivectorized,y_train)
predictions_trigram=model_trigram.predict(vect_trigram.transform(X_test))

In [37]:
ls = linear_model.Lasso(alpha=0.01)
ls.fit(vect.transform(df['text'].values.astype('U'))
       , df['score'])
lll = ls.predict(vect.transform(X_test))
print(r2_score(lll,y_test))
print(mean_squared_error(lll,y_test))

  max_iter, tol, rng, random, positive)


0.9999080976585347
0.001945015930420403


In [12]:
#results for linear regression
# 1-gram
print(r2_score(score_pred,y_test))
print(mean_squared_error(score_pred,y_test))
# 2-gram
print(r2_score(predictions_bigram,y_test))
print(mean_squared_error(predictions_bigram,y_test))
# 3-gram
print(r2_score(predictions_trigram,y_test))
print(mean_squared_error(score_pred,y_test))

-0.3738671711853754
17.077917174944066
-0.42308978136500075
16.62826223202141
-0.4225145565611168
17.077917174944066


## Super Vector Regression

In [49]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

gscv = GridSearchCV(SVR(kernel='linear', gamma='scale'), cv=5,
                   param_grid={"C": range(1,20),
                               'degree':range(1,10), 'epsilon': np.arange(0,1, 0.1)})
gscv.fit(X_train_vectorized, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': range(1, 20), 'degree': range(1, 10),
                         'epsilon': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [50]:
gscv.best_params_

{'C': 19, 'degree': 1, 'epsilon': 0.9}

## kernel = linear 

In [13]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
# 1-gram
model = SVR(kernel='linear', gamma='scale', C = 19, degree = 1, epsilon = 0.9)
model.fit(X_train_vectorized, y_train)
score_pred = model.predict(vect.transform(X_test))
# 2-gram
model_bigram=SVR(kernel='linear', gamma='scale', C = 19, degree = 1, epsilon = 0.9)
model_bigram.fit(X_train_bivectorized,y_train)
predictions_bigram=model_bigram.predict(vect_bigram.transform(X_test))
# 3-gram
model_trigram=SVR(kernel='linear', gamma='scale', C = 19, degree = 1, epsilon = 0.9)
model_trigram.fit(X_train_trivectorized,y_train)
predictions_trigram=model_trigram.predict(vect_trigram.transform(X_test))

In [19]:
#results for svm regression
# 1-gram
print(r2_score(score_pred,y_test))
print(mean_squared_error(score_pred,y_test))
print(mean_absolute_error(score_pred,y_test))
# 2-gram
print(r2_score(predictions_bigram,y_test))
print(mean_squared_error(predictions_bigram,y_test))
print(mean_absolute_error(score_pred,y_test))
# 3-gram
print(r2_score(predictions_trigram,y_test))
print(mean_squared_error(score_pred,y_test))
print(mean_absolute_error(score_pred,y_test))

-26.82276900995023
21.131486286074086
3.969048231833208
[[1.         0.25561371]
 [0.25561371 1.        ]]
-27.13514305255644
21.08086639426944
3.969048231833208
-27.18949958726813
21.131486286074086
3.969048231833208


## kernel = poly

In [15]:
# 1-gram
model = SVR(kernel='poly', gamma='scale')
model.fit(X_train_vectorized, y_train)
score_pred = model.predict(vect.transform(X_test))
# 2-gram
model_bigram=SVR(kernel='poly', gamma='scale')
model_bigram.fit(X_train_bivectorized,y_train)
predictions_bigram=model_bigram.predict(vect_bigram.transform(X_test))
# 3-gram
model_trigram=SVR(kernel='poly', gamma='scale')
model_trigram.fit(X_train_trivectorized,y_train)
predictions_trigram=model_trigram.predict(vect_trigram.transform(X_test))

In [16]:
#results for svm regression
# 1-gram
print(r2_score(score_pred,y_test))
print(mean_squared_error(score_pred,y_test))
print(mean_absolute_error(score_pred,y_test))
# 2-gram
print(r2_score(predictions_bigram,y_test))
print(mean_squared_error(predictions_bigram,y_test))
print(mean_absolute_error(score_pred,y_test))
# 3-gram
print(r2_score(predictions_trigram,y_test))
print(mean_squared_error(score_pred,y_test))
print(mean_absolute_error(score_pred,y_test))

-11.176211012308881
20.925583101126062
3.883298226652802
-11.274937345982142
20.796521879396295
3.883298226652802
-11.300432482803304
20.925583101126062
3.883298226652802


## kernel = rbf

In [17]:
# 1-gram
model = SVR(kernel='rbf', gamma='scale')
model.fit(X_train_vectorized, y_train)
score_pred = model.predict(vect.transform(X_test))
# 2-gram
model_bigram=SVR(kernel='rbf', gamma='scale')
model_bigram.fit(X_train_bivectorized,y_train)
predictions_bigram=model_bigram.predict(vect_bigram.transform(X_test))
# 3-gram
model_trigram=SVR(kernel='rbf', gamma='scale')
model_trigram.fit(X_train_trivectorized,y_train)
predictions_trigram=model_trigram.predict(vect_trigram.transform(X_test))

In [18]:
#results for svm regression
# 1-gram
print(r2_score(score_pred,y_test))
print(mean_squared_error(score_pred,y_test))
print(mean_absolute_error(score_pred,y_test))
# 2-gram
print(r2_score(predictions_bigram,y_test))
print(mean_squared_error(predictions_bigram,y_test))
print(mean_absolute_error(score_pred,y_test))
# 3-gram
print(r2_score(predictions_trigram,y_test))
print(mean_squared_error(score_pred,y_test))
print(mean_absolute_error(score_pred,y_test))

-26.82276900995023
21.131486286074086
3.969048231833208
-27.13514305255644
21.08086639426944
3.969048231833208
-27.18949958726813
21.131486286074086
3.969048231833208


## Random Forest Regression

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [45]:
# 1-gram
regr1 = RandomForestRegressor(max_depth=10, random_state=0, 
                             n_estimators=100)
regr1.fit(X_train_vectorized, y_train)
score_pred = regr1.predict(vect.transform(X_test))
# 2-gram
regr2 = RandomForestRegressor(max_depth=10, random_state=0, 
                             n_estimators=100)
regr2.fit(X_train_bivectorized, y_train)
predictions_bigram = regr2.predict(vect_bigram.transform(X_test))
# 3-gram
regr3 = RandomForestRegressor(max_depth=10, random_state=0, 
                             n_estimators=100)
regr3.fit(X_train_trivectorized, y_train)
predictions_trigram = regr3.predict(vect_trigram.transform(X_test))

In [46]:
# 1-gram
print(r2_score(score_pred,y_test))
print(mean_squared_error(score_pred,y_test))
print(mean_absolute_error(score_pred,y_test))
# 2-gram
print(r2_score(predictions_bigram,y_test))
print(mean_squared_error(predictions_bigram,y_test))
print(mean_absolute_error(score_pred,y_test))
# 3-gram
print(r2_score(predictions_trigram,y_test))
print(mean_squared_error(score_pred,y_test))
print(mean_absolute_error(score_pred,y_test))

-4.905868688129202
17.697042016215544
3.459782265799247
-4.911816247603425
17.907333208959596
3.459782265799247
-5.176741118248769
17.697042016215544
3.459782265799247
