In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as sklm
import xgboost as xgb
import matplotlib
import lightgbm as lgb
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
submissions = pd.read_csv('SampleSubmission.csv')

In [3]:
print(train.shape)
print(test.shape)

(10001, 4)
(5177, 2)


In [4]:
test = test.fillna('It was a good movie')
test.isna().sum()

tweet_id     0
safe_text    0
dtype: int64

In [10]:
y = np.array(train['agreement'])

In [11]:
combined = list(train.safe_text.values) + list(test.safe_text.values)
len(combined)

15178

In [12]:
x_train = np.array(combined[:10001])
print(len(x_train))
x_test = np.array(combined[10001:])
print(len(x_test))
x_test[:2]

10001
5177


array(['<user> <user> ... &amp; 4 a vaccine given 2 healthy peeps, FDA think just not worth the AE risk unfortunately.',
       'Students starting school without whooping cough vaccinations <url> #scpick'],
      dtype='<U152')

In [13]:
import numpy.random as nr
import sklearn.model_selection as ms
## Randomly sample cases to create independent training and test data
nr.seed(9988)
indx = range(np.array(x_train.shape[0]))
indx = ms.train_test_split(indx, test_size = 0.2)
x_train1 = x_train[indx[0]]
y_train1 = np.ravel(y[indx[0]])
x_test1 = x_train[indx[1]]
y_test1 = np.ravel(y[indx[1]])

In [14]:
print(x_train1.shape)
print(y_train1.shape)
print(x_test1.shape)
print(y_test1.shape)

(8000,)
(8000,)
(2001,)
(2001,)


In [15]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1,1), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

tfv.fit(x_train)
x_train_tfv =  tfv.transform(x_train1)
x_test_tfv = tfv.transform(x_test1)


test_enc = tfv.transform(test.safe_text.values)

In [16]:
print(x_train_tfv.shape)
print(y_train1.shape)
print(x_test_tfv.shape)

(8000, 4047)
(8000,)
(2001, 4047)


In [17]:
train.agreement.value_counts()

1.000000    5868
0.666667    3894
0.333333     239
Name: agreement, dtype: int64

In [18]:
lg = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.01, n_estimators=3000, max_depth=50)
eval_set = [(x_test_tfv, y_test1)]
lg.fit(x_train_tfv, y_train1, eval_set=eval_set, early_stopping_rounds=50)

[1]	valid_0's l2: 0.0319273
Training until validation scores don't improve for 50 rounds
[2]	valid_0's l2: 0.031877
[3]	valid_0's l2: 0.0318279
[4]	valid_0's l2: 0.0317809
[5]	valid_0's l2: 0.0317341
[6]	valid_0's l2: 0.0316878
[7]	valid_0's l2: 0.0316419
[8]	valid_0's l2: 0.0315971
[9]	valid_0's l2: 0.0315545
[10]	valid_0's l2: 0.0315118
[11]	valid_0's l2: 0.0314716
[12]	valid_0's l2: 0.0314337
[13]	valid_0's l2: 0.0313946
[14]	valid_0's l2: 0.0313587
[15]	valid_0's l2: 0.0313213
[16]	valid_0's l2: 0.0312858
[17]	valid_0's l2: 0.0312508
[18]	valid_0's l2: 0.0312146
[19]	valid_0's l2: 0.0311819
[20]	valid_0's l2: 0.0311502
[21]	valid_0's l2: 0.0311172
[22]	valid_0's l2: 0.0310863
[23]	valid_0's l2: 0.0310558
[24]	valid_0's l2: 0.0310257
[25]	valid_0's l2: 0.030995
[26]	valid_0's l2: 0.030966
[27]	valid_0's l2: 0.0309376
[28]	valid_0's l2: 0.0309083
[29]	valid_0's l2: 0.0308789
[30]	valid_0's l2: 0.0308524
[31]	valid_0's l2: 0.0308231
[32]	valid_0's l2: 0.0307975
[33]	valid_0's l2: 0.03

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.01, max_depth=50,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=3000, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [19]:
predictions = lg.predict(x_test_tfv)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test1, predictions) ** 0.5

0.17261433479048996

In [24]:
test_enc = tfv.transform(x_test)
pred = lg.predict(test_enc)
test['agreement'] = pred
test.to_csv('test1.csv', index = False)

In [26]:
tfv.fit(train.safe_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [27]:
tfv.fit(train.agreement)

AttributeError: 'float' object has no attribute 'lower'