In [23]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
#CountVectorizer

In [2]:
df = pd.read_csv('../data/Answers.csv', encoding='latin')

In [3]:
df_answers = df.head(10_000)

In [4]:
vectorizer = TfidfVectorizer(max_features=1000)

In [5]:
X = vectorizer.fit_transform(df_answers['Body'])
y = df_answers['Score']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.20,
  random_state=42,
)

In [7]:
%%time
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
print('100 - 466 ms')
print('500 - 3.26 s')
print('1000 - 7.54 s')
print('1500 - 13.8 s')
print('2000 - 23.1 s')

100 - 466 ms
500 - 3.26 s
1000 - 7.54 s
1500 - 13.8 s
2000 - 23.1 s
CPU times: user 6min 34s, sys: 31 ms, total: 6min 34s
Wall time: 6min 36s


In [8]:
model.predict(vectorizer.transform(["shitty answer"]))

array([1.86])

In [9]:
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [10]:
print(mean_squared_error(y_train, y_pred_train))
print(mean_squared_error(y_test, y_pred_test))


2509.5502678579237
17180.002217353984


In [20]:
df_lemm = pd.read_csv('../data/lemmatized_answers.csv')
df_lemm = df_lemm.dropna().head(10_000)

vectorizer = TfidfVectorizer(max_features=1000)

X = vectorizer.fit_transform(df_lemm['Body'])
y = df_lemm['Score']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=42)

In [21]:
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

print('Randomforest with preprocessing')
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print(mean_squared_error(y_train, y_pred_train))
print(mean_squared_error(y_test, y_pred_test))

Randomforest with preprocessing
3174.4421556684792
13891.677465291801


In [26]:
all_models = [XGBRegressor, ExtraTreesRegressor, GradientBoostingRegressor, DecisionTreeRegressor]

for regression_model in all_models:
    print('{} with preprocessing'.format(regression_model.__name__))

    model = regression_model(random_state=42)
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    print(mean_squared_error(y_train, y_pred_train))
    print(mean_squared_error(y_test, y_pred_test))
    print('=' * 50)

XGBRegressor with preprocessing
667.1144635955592
13642.749371961349
ExtraTreesRegressor with preprocessing


KeyboardInterrupt: 

In [29]:
df_lemm = pd.read_csv('../data/lemmatized_answers.csv')
df_lemm = df_lemm.dropna()

vectorizer = TfidfVectorizer(max_features=1000)

X = vectorizer.fit_transform(df_lemm['Body'])
y = df_lemm['Score']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=42)

In [30]:
# XGBRegressor all data & default params
model = XGBRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print(mean_squared_error(y_train, y_pred_train))
print(mean_squared_error(y_test, y_pred_test))

163.57058876486929
571.6678598876796


In [37]:
# XGBRegressor grid search
from sklearn.model_selection import GridSearchCV
parameters = {
    'max_depth': [3, 5, 7],
    'n_estimators': [20, 50, 100, 500],
    'random_state': [42]
}


df_lemm = pd.read_csv('../data/lemmatized_answers.csv').dropna().head(10_000)
X = vectorizer.fit_transform(df_lemm['Body'])
y = df_lemm['Score']

xgb_grid = GridSearchCV(XGBRegressor(), parameters, n_jobs=1, verbose=True, scoring='neg_mean_squared_error')
xgb_grid.fit(X, y)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
-29331.031705271358
{'max_depth': 3, 'n_estimators': 20, 'random_state': 42}


In [38]:
# XGBRegressor all data & best params

df_lemm = pd.read_csv('../data/lemmatized_answers.csv')
df_lemm = df_lemm.dropna()

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df_lemm['Body'])
y = df_lemm['Score']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=42)
model = XGBRegressor(**xgb_grid.best_params_)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print(mean_squared_error(y_train, y_pred_train))
print(mean_squared_error(y_test, y_pred_test))

313.79014473163744
562.1178368089464
