#Predict score field by text and parent_text fields.

In [0]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler


import spacy
from spacy.symbols import ORTH

spacy_en = spacy.load('en')
spacy_en.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "not"}])
spacy_en.tokenizer.add_special_case("didn't", [{ORTH: "did"}, {ORTH: "not"}])


SEED = 43
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

In [0]:
from google.colab import drive
drive.mount('/content/drive')

# Data Prepatation

In [0]:
columns = ["text", "parent_text", "score"]
df = pd.concat([
    pd.read_csv("/content/drive/My Drive/jetbrains/comments_positive.csv", usecols=columns, na_filter=False),
    pd.read_csv("/content/drive/My Drive/jetbrains/comments_negative.csv", usecols=columns, na_filter=False)
], ignore_index=True)

In [0]:
y = df['score']
df.drop(columns='score', inplace=True)
X = df

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=SEED)

# To be sure we don't use indices to predict something
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

print("Train shape: {}".format(X_train.shape))
print("Test shape: {}".format(X_test.shape))

Train shape: (3800000, 2)
Test shape: (200000, 2)


In [0]:
X_train.head()

Unnamed: 0,text,parent_text
0,"I was a whole bunch of folks on Recess, yup. :...",Your IMDB just blew my mind. You were Upside D...
1,"I'm not quite sure how that was relevant, but ...",http://i.imgur.com/GKLI7.jpg
2,Sounds like you weren't paying attention.,Sadly high school history doesn't teach you ab...
3,&gt;Polygamy doesn't mean that there's just wo...,&gt; from a feminist perspective \n\nSo what i...
4,The New Orleans Gargle-Glub-Glubs.,My high school's mascot was water...


In [0]:
y_train.head()

0    454
1     71
2     -9
3    -12
4     91
Name: score, dtype: int64

## Cleaning the data

Т.к. некоторые тексты - это просто смайлики или нечитаемые бессмысленные вещи, то их было решено удалить. Также были удалены строки, если после очисти они были пустыми. Всего из тренировочных данных было удалено около 4500  строк.

In [0]:
def clean_text(text):
    text = re.sub(r'\<[^>]*\>', '', text)
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'&[a-z]{0,7};', ' ', text)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',text)
    text = re.sub('@[^\s]+','AT_USER',text)
    text = re.sub(r'\\x\d{1,4}', '', text)
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'\r', ' ', text)
    text = re.sub('[\s]+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def clean_df(df, y, text_cleaner, train=True):
    """
    Apply text_cleaner to df columns and deleting empty rows.

    Args:
        df : initial DataFrame
        y : target values
        text_cleaner: function to apply to 'text' and 'parent_text' columns

    Returns:
        Pandas DataFrame with rating coumns as well
    """
    df_cleaned = pd.DataFrame()
    df_cleaned['text'] = df['text'].apply(lambda x: text_cleaner(x))
    df_cleaned['parent_text'] = df['parent_text'].apply(lambda x: text_cleaner(x))
    df_cleaned['rating'] = y    

    if train:
        replacing = {'':np.nan, 'null':np.nan, 'NaN':np.nan, 'NA':np.nan}
    else:
        replacing = {'':'empty', 'null':'empty', 'NaN':'empty', 'NA':'empty'}
    df_cleaned['text'].replace(to_replace=replacing, value=None, inplace=True)
    df_cleaned['parent_text'].replace(to_replace=replacing, value=None, inplace=True)
    df_cleaned.dropna(subset=['text', 'parent_text', 'rating'], inplace=True)
    return df_cleaned

def tokenizer(text):
    return [tok.lemma_ for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]

def transform_df(df, tokenizer=tokenizer):
    """
    Adds several columns to df: 
    1) lemmatized text 
    2) lemmatized parent text 
    3) text + parent text 
    4) lemmatized text + parent text
    """
    # commented because it was not so good and took a lot of ram and time

    # df['text_lemm'] = df['text'].apply(lambda x: ' '.join(tokenizer(x)))
    # df['parent_lemm'] = df['parent_text'].apply(lambda x: ' '.join(tokenizer(x)))
    df['text_and_parent'] = df['text']+ ' ' + df['parent_text']
    # df['text_and_parent_lemm'] = df['text_lemm'] + df['parent_lemm']
    
    return df

def scale_y(df, standart=True, training=True, scaler=None):
    """
    Transforms rating score with StandardScaler or MinMaxScaler
    Returns pd.DataFrame with new column 'rating_scaled', scaler
    """
    if training:  
        scaler = StandardScaler() if standart else MinMaxScaler()     
        X = np.array(list(df['rating'])).reshape(-1,1)
        scaler.fit(X)
    df['rating_scaled'] = df['rating'].apply(lambda x : scaler.transform(np.array(x).reshape(1,-1))[0][0])

    return df, scaler

In [0]:
# cleaning df
train = clean_df(X_train, y_train, clean_text)
test = clean_df(X_test, y_test, clean_text, train=False)
train.shape, X_train.shape, test.shape, X_test.shape

((3795465, 3), (3800000, 2), (200000, 3), (200000, 2))

In [0]:
train = transform_df(train)
test = transform_df(test)
train.head()

Unnamed: 0,text,parent_text,rating,text_and_parent
0,I was a whole bunch of folks on Recess yup Th...,Your IMDB just blew my mind You were Upside Do...,454,I was a whole bunch of folks on Recess yup Th...
1,Im not quite sure how that was relevant but Im...,URL,71,Im not quite sure how that was relevant but Im...
2,Sounds like you werent paying attention,Sadly high school history doesnt teach you abo...,-9,Sounds like you werent paying attention Sadly ...
3,Polygamy doesnt mean that theres just women i...,from a feminist perspective So what if you wo...,-12,Polygamy doesnt mean that theres just women i...
4,The New Orleans GargleGlubGlubs,My high schools mascot was water,91,The New Orleans GargleGlubGlubs My high school...


In [0]:
# adding normilized rating score
train, scaler = scale_y(train)
test, scaler  = scale_y(test, training=False, scaler=scaler)
train.head()

Unnamed: 0,text,parent_text,rating,text_and_parent,rating_scaled
0,I was a whole bunch of folks on Recess yup Th...,Your IMDB just blew my mind You were Upside Do...,454,I was a whole bunch of folks on Recess yup Th...,1.709532
1,Im not quite sure how that was relevant but Im...,URL,71,Im not quite sure how that was relevant but Im...,-0.098649
2,Sounds like you werent paying attention,Sadly high school history doesnt teach you abo...,-9,Sounds like you werent paying attention Sadly ...,-0.476337
3,Polygamy doesnt mean that theres just women i...,from a feminist perspective So what if you wo...,-12,Polygamy doesnt mean that theres just women i...,-0.4905
4,The New Orleans GargleGlubGlubs,My high schools mascot was water,91,The New Orleans GargleGlubGlubs My high school...,-0.004227


Убедимся, что ничего не съехало:

In [0]:
train.iloc[-1], X_train.iloc[-1], y_train.iloc[-1]

(text               4x the pixels has nothing to do with the resol...
 parent_text        The new GPU is 2x as powerful as the iPad 2s b...
 rating                                                            -8
 text_and_parent    4x the pixels has nothing to do with the resol...
 rating_scaled                                              -0.471616
 Name: 3799999, dtype: object,
 text           4x the pixels has nothing to do with the resol...
 parent_text    The new GPU is 2x as powerful as the iPad 2's,...
 Name: 3799999, dtype: object,
 -8)

In [0]:
test.iloc[-1], X_test.iloc[-1], y_test.iloc[-1]

(text               Ha people on Reddit go on the side of whoever ...
 parent_text        I just hope you guys dont ruin it for the play...
 rating                                                            91
 text_and_parent    Ha people on Reddit go on the side of whoever ...
 rating_scaled                                            -0.00422711
 Name: 199999, dtype: object,
 text           Ha, people on Reddit go on the side of whoever...
 parent_text    I just hope you guys don't ruin it for the pla...
 Name: 199999, dtype: object,
 91)

In [0]:
train.head()

Unnamed: 0,text,parent_text,rating,text_and_parent,rating_scaled
0,I was a whole bunch of folks on Recess yup Th...,Your IMDB just blew my mind You were Upside Do...,454,I was a whole bunch of folks on Recess yup Th...,1.709532
1,Im not quite sure how that was relevant but Im...,URL,71,Im not quite sure how that was relevant but Im...,-0.098649
2,Sounds like you werent paying attention,Sadly high school history doesnt teach you abo...,-9,Sounds like you werent paying attention Sadly ...,-0.476337
3,Polygamy doesnt mean that theres just women i...,from a feminist perspective So what if you wo...,-12,Polygamy doesnt mean that theres just women i...,-0.4905
4,The New Orleans GargleGlubGlubs,My high schools mascot was water,91,The New Orleans GargleGlubGlubs My high school...,-0.004227


# Sklearn Models

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, SGDRegressor, Lasso

In [0]:
# TfidfVectorizer params were chosen mostly based on RAM limits
vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), analyzer='word',
                             stop_words='english', max_features=200000)
X_train = vectorizer.fit_transform(train['text_and_parent'])
X_test = vectorizer.transform(test['text_and_parent'])
X_train.shape, X_test.shape

((3795465, 200000), (200000, 200000))

После перебора нескольких вариантов исходного текста ('text', 'text_lemm', 'text_and_parent', 'text_and_parent_lemm') было решено использовать 'text_and_parent' для построения tf-idf матрицы (так MSE минимальна для большинства моделей). 


Далее смотрим на ошибки на тренировочной и тестовой выборках и выбирем модель для кросс-валидации. 

In [0]:
def check_model(model, name, X_train, y_train, X_test, y_test, scaler=scaler):
    """
    Fits model on train data and prints MSE error information for train/test data
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    print('[TRAIN] MSE for {} [SCALED Y] - {}'.format(name, mean_squared_error(y_train, y_pred_train)))
    print('[TEST] MSE for {} [SCALED Y] - {}'.format(name, mean_squared_error(y_test, y_pred)))
    print('[TRAIN] MSE for {} [ORIGINAL Y] - {}'.format(name, mean_squared_error(scaler.inverse_transform(y_train), scaler.inverse_transform(y_pred_train))))
    print('[TEST] MSE for {} [ORIGINAL Y] - {}'.format(name, mean_squared_error(scaler.inverse_transform(y_test), scaler.inverse_transform(y_pred))))
    print()

In [0]:
for model, name in [(Ridge(), 'Ridge'), (SGDRegressor(), 'SGD'), (Lasso(), 'Lasso')]:
    check_model(model, name, X_train, train['rating_scaled'], X_test, test['rating_scaled'])

[TRAIN] MSE for Ridge [SCALED Y] - 0.8292247731167245
[TEST] MSE for Ridge [SCALED Y] - 0.8928051132111298
[TRAIN] MSE for Ridge [ORIGINAL Y] - 37203.67066495778
[TEST] MSE for Ridge [ORIGINAL Y] - 40056.2410539822

[TRAIN] MSE for SGD [SCALED Y] - 0.9279282281703671
[TEST] MSE for SGD [SCALED Y] - 0.9146510793218114
[TRAIN] MSE for SGD [ORIGINAL Y] - 41632.06083654796
[TEST] MSE for SGD [ORIGINAL Y] - 41036.37352817834



Подберем гиперпараметры для Ridge регрессии:

In [21]:
model = RidgeCV(cv=3, alphas=[0.1, 1.0, 10, 100])
model.fit(X_train, train['rating_scaled'])

RidgeCV(alphas=array([  0.1,   1. ,  10. , 100. ]), cv=3, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [22]:
print('NEG MSE [TRAIN]: {}'.format(model.score(X_train, train['rating_scaled'])))
print('NEG MSE [TEST]: {}'.format(model.score(X_test, test['rating_scaled'])))

NEG MSE [TRAIN]: 0.1460142057322511
NEG MSE [TEST]: 0.1083783128002126


In [23]:
mse_train_s = mean_squared_error(train['rating_scaled'], model.predict(X_train))
mse_train = mean_squared_error(train['rating'], scaler.inverse_transform(model.predict(X_train)).astype(int))

print('MSE [TRAIN_SCALED]: {}'.format(mse_train_s))
print('MSE [TRAIN]: {}'.format(mse_train))

MSE [TRAIN_SCALED]: 0.8539857942677487
MSE [TRAIN]: 38310.67912680001


In [24]:
mse_test_s = mean_squared_error(test['rating_scaled'], model.predict(X_test))
mse_test = mean_squared_error(test['rating'], scaler.inverse_transform(model.predict(X_test)).astype(int))

print('MSE [TEST_SCALED]: {}'.format(mse_test_s))
print('MSE [TEST]: {}'.format(mse_test))

MSE [TEST_SCALED]: 0.8791710848930371
MSE [TEST]: 39439.86682


MSE на тестовых данных: 39 439. 