In [1]:
import pandas as pd
import fasttext
import re
from sklearn.model_selection import train_test_split,StratifiedKFold,KFold
from sklearn.metrics import classification_report
import ast
import numpy as np

In [2]:
df = pd.read_csv('ready_to_serve_train.csv')
df_test = pd.read_csv('ready_to_serve_dev.csv')
# df_test = df_test[~pd.isnull(df_test.text)]

In [3]:
test_lines = df_test["edited_head_line"].values

In [4]:
df.head(5)

Unnamed: 0,id,original,edit,grades,meanGrade,grade_round,grades_0,grades_1,grades_2,grades_3,grades_4,edited_head_line,original_cleaned
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,0,1,0,0,0,0,france is hunting down its citizens who joined...,france is hunting down its citizens who joined...
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,2,3,3,1,1,0,"pentagon claims 2,000 % increase in russian tr...","pentagon claims 2,000 % increase in russian tr..."
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,1,2,2,1,0,0,iceland pm calls snap vote as pedophile furor ...,iceland pm calls snap vote as pedophile furor ...
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4,0,2,0,0,0,0,"in an apparent first , iran and israel slap ea...","in an apparent first , iran and israel engage ..."
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0,0,0,0,0,0,0,trump was told weeks ago that flynn misled sch...,trump was told weeks ago that flynn misled vic...


In [5]:
df["original_plus_edited"] = df["original_cleaned"] + df["edited_head_line"]

In [6]:
def write(train,test,column,dev=False):
    if dev:        
        with open('train.txt','w',encoding='utf8') as write:
            write.writelines('\n'.join(df[column].tolist()))
    else:
        with open('train.txt','w',encoding='utf8') as write:
            write.writelines('\n'.join(df.loc[train,column].tolist()))
        with open('test.txt','w',encoding='utf8') as write:
            write.writelines('\n'.join(df.loc[test,column].tolist()))

In [7]:
def label_fasttext(data,X,y,multi_label=False):
    if multi_label:
        labels = ast.literal_eval(data[y])
        if not labels:
            return "__label__-1 " + data[X]
        else:
            label = "".join(["__label__" + str(cls) + " " for cls in labels])  + data[X]
            return label
    else:
        return "__label__" + str(data[y]) + " " + " " + data[X]
    
def get_test_predictions(test_lines,model):
    predictions = []
    for line in test_lines:
        predictions.append(int(model.predict(line)[0][0].replace("__label__","")))
    return predictions    

### For grade 0

In [8]:
df['p_class'] = df.apply(lambda x:label_fasttext(x,'original_plus_edited','grades_0'),axis=1)

In [16]:
kfold = StratifiedKFold(4, True)
for train_index, test_index in kfold.split(df,df["grades_0"]):
    write(train_index,test_index,'p_class')
    model = fasttext.train_supervised('train.txt', lr=0.01, dim=400, epoch=10,  word_ngrams=3, loss='softmax')
    print(model.test('test.txt'))

(2414, 0.37903893951946976, 0.37903893951946976)
(2413, 0.3787815996684625, 0.3787815996684625)
(2413, 0.3787815996684625, 0.3787815996684625)
(2412, 0.37893864013267, 0.37893864013267)


In [10]:
df_test["grades_0"] = get_test_predictions(test_lines,model)

### For grade 1

In [11]:
df['p_class'] = df.apply(lambda x:label_fasttext(x,'edited_head_line','grades_1'),axis=1)

In [12]:
kfold = StratifiedKFold(4, True)
for train_index, test_index in kfold.split(df,df["grades_1"]):
    write(train_index,test_index,'p_class')
    model = fasttext.train_supervised('train.txt',lr = 0.1,epoch=7)
    print(model.test('test.txt'))

(2414, 0.3976801988400994, 0.3976801988400994)
(2413, 0.39121425611272276, 0.39121425611272276)
(2413, 0.39328636552009943, 0.39328636552009943)
(2412, 0.3951077943615257, 0.3951077943615257)


In [13]:
df_test["grades_1"] = get_test_predictions(test_lines,model)

### For grade 2

In [14]:
df['p_class'] = df.apply(lambda x:label_fasttext(x,'edited_head_line','grades_2'),axis=1)

In [15]:
kfold = StratifiedKFold(4, True)
for train_index, test_index in kfold.split(df,df["grades_2"]):
    write(train_index,test_index,'p_class')
    model = fasttext.train_supervised('train.txt',lr = 0.1,epoch=7)
    print(model.test('test.txt'))

(2415, 0.43643892339544516, 0.43643892339544516)
(2413, 0.43389970990468296, 0.43389970990468296)
(2412, 0.4490049751243781, 0.4490049751243781)
(2412, 0.44112769485903813, 0.44112769485903813)


In [16]:
df_test["grades_2"] = get_test_predictions(test_lines,model)

### For grade 3

In [17]:
df['p_class'] = df.apply(lambda x:label_fasttext(x,'edited_head_line','grades_3'),axis=1)

In [18]:
kfold = StratifiedKFold(4, True)
for train_index, test_index in kfold.split(df,df["grades_3"]):
    write(train_index,test_index,'p_class')
    model = fasttext.train_supervised('train.txt',lr = 0.1,epoch=7)
    print(model.test('test.txt'))

(2414, 0.6031483015741508, 0.6031483015741508)
(2414, 0.6197183098591549, 0.6197183098591549)
(2413, 0.6249481972648155, 0.6249481972648155)
(2411, 0.6233927830775612, 0.6233927830775612)


In [19]:
df_test["grades_3"] = get_test_predictions(test_lines,model)

### For grade 4

In [20]:
df['p_class'] = df.apply(lambda x:label_fasttext(x,'edited_head_line','grades_4'),axis=1)

In [21]:
kfold = StratifiedKFold(4, True)
for train_index, test_index in kfold.split(df,df["grades_4"]):
    write(train_index,test_index,'p_class')
    model = fasttext.train_supervised('train.txt',lr = 0.1,epoch=7)
    print(model.test('test.txt'))

(2415, 0.8484472049689441, 0.8484472049689441)
(2414, 0.8487986743993372, 0.8487986743993372)
(2412, 0.8499170812603648, 0.8499170812603648)
(2411, 0.8490253007051016, 0.8490253007051016)


In [22]:
df_test["grades_4"] = get_test_predictions(test_lines,model)

In [23]:
df_test.grades_0.value_counts()

2    2206
3     213
Name: grades_0, dtype: int64

In [24]:
df_test.grades_1.value_counts()

1    1834
2     541
0      38
3       6
Name: grades_1, dtype: int64

In [25]:
df_test.grades_2.value_counts()

1    1547
0     864
2       8
Name: grades_2, dtype: int64

In [26]:
df_test.grades_3.value_counts()

0    2286
1     133
Name: grades_3, dtype: int64

In [27]:
df_test.grades_4.value_counts()

0    2417
1       2
Name: grades_4, dtype: int64

In [28]:
df_test["meanGrades"] = df_test[["grades_0","grades_1","grades_2","grades_3","grades_4"]].mean(axis=1)

In [29]:
df_test.head(5)

Unnamed: 0,id,original,edit,edited_head_line,original_cleaned,grades_0,grades_1,grades_2,grades_3,grades_4,meanGrades
0,1723,Thousands of gay and bisexual <men/> convicted...,swans,thousands of gay and bisexual swans convicted ...,thousands of gay and bisexual men convicted of...,2,1,1,0,0,0.8
1,12736,Special <prosecutor/> appointed to Trump Russia,chef,special chef appointed to trump russia,special prosecutor appointed to trump russia,2,2,1,0,0,1.0
2,12274,Spanish police detain man and search Ripoll ad...,squad,spanish police detain man and search ripoll ad...,spanish police detain man and search ripoll ad...,3,1,1,0,0,1.0
3,8823,N.Y. Times <reprimands/> reporter for sharing ...,applauds,n.y. times applauds reporter for sharing unfou...,n.y. times reprimands reporter for sharing unf...,2,2,1,0,0,1.0
4,5087,Vladimir Putin Releases Video Simulation Of Ru...,balloon,vladimir putin releases video simulation of ru...,vladimir putin releases video simulation of ru...,2,1,0,0,0,0.6


### Kfold for regression

In [30]:
from sklearn.metrics import mean_squared_error
from math import sqrt

kfold = KFold(4, True)
all_grades = ["grades_0","grades_1","grades_2","grades_3","grades_4"]
for train_index, test_index in kfold.split(df):
    df_dev = df.iloc[test_index]
    
    for grade in all_grades:
        df['p_class'] = df.apply(lambda x:label_fasttext(x,'edited_head_line',grade),axis=1)
        write(train_index,test_index,'p_class')
        cv_test_lines = df["edited_head_line"].values[test_index]
        model = fasttext.train_supervised('train.txt',lr = 0.1,epoch=7)
        
        
        df_dev[grade] = get_test_predictions(cv_test_lines,model)
        
    df_dev["mean_grade_pred"] = df_dev[all_grades].mean(axis=1)
    mean_squared_error
    print(sqrt(mean_squared_error(df_dev["mean_grade_pred"],df_dev["meanGrade"])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.5879505661310666


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.6111321667288219


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.590682593473775


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


0.5985219934898562


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [31]:
df_dev.head()

Unnamed: 0,id,original,edit,grades,meanGrade,grade_round,grades_0,grades_1,grades_2,grades_3,grades_4,edited_head_line,original_cleaned,original_plus_edited,p_class,mean_grade_pred
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,0,2,1,1,0,0,france is hunting down its citizens who joined...,france is hunting down its citizens who joined...,france is hunting down its citizens who joined...,__label__0 france is hunting down its citizen...,0.8
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,1,2,1,0,0,0,iceland pm calls snap vote as pedophile furor ...,iceland pm calls snap vote as pedophile furor ...,iceland pm calls snap vote as pedophile furor ...,__label__0 iceland pm calls snap vote as pedo...,0.6
12,7614,Trump 's 2nd Nominee for <Army/> Secretary Wit...,Class,22100,1.0,1,2,2,1,0,0,trump 's 2nd nominee for class secretary withd...,trump 's 2nd nominee for army secretary withdraws,trump 's 2nd nominee for army secretary withdr...,__label__0 trump 's 2nd nominee for class sec...,1.0
15,14549,' Chibok <girls/> ' reunited with families,salamis,10000,0.2,0,2,1,1,0,0,chibok salamis reunited with families,chibok girls reunited with families,chibok girls reunited with familieschibok sala...,__label__0 chibok salamis reunited with families,0.8
19,6822,Erdogan Rejects Arab Demands ; Turkish <Troops...,Turkeys,22110,1.2,1,2,1,1,0,0,erdogan rejects arab demands ; turkish turkeys...,erdogan rejects arab demands ; turkish troops ...,erdogan rejects arab demands ; turkish troops ...,__label__0 erdogan rejects arab demands ; tur...,0.8


### Try stacking instead of average

In [32]:
ntrain = df.shape[0]
ntest = df_test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 4 # set folds for out-of-fold prediction
# kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
        
def write1(df,train,test,columns,dev=False):
    if dev:        
        with open('test.txt','w',encoding='utf8') as write:
            write.writelines('\n'.join(df[columns[0]].tolist()))
    else:
        with open('train.txt','w',encoding='utf8') as write:
            write.writelines('\n'.join(df.loc[train,columns[0]].tolist()))
        with open('dev.txt','w',encoding='utf8') as write:
            write.writelines('\n'.join(df.loc[test,columns[1]].tolist()))        

In [33]:
def get_oof(x_train, x_test,grade):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest ))
    
    skf = StratifiedKFold(n_splits=4)
    for i, (train_index, test_index) in enumerate(skf.split(df, df[grade])):
        print("Doing for fold " + str(i))
        x_tr = x_train.iloc[train_index]
        x_te = x_train.iloc[test_index]
        
        x_tr['p_class'] = x_tr.apply(lambda x:label_fasttext(x,'edited_head_line',grade),axis=1)
        write1(x_train,train_index,test_index,['p_class',"edited_head_line"])
        write1(x_test,[],[],['edited_head_line'],True)
        cv_test_lines = x_te["edited_head_line"].values
        test_lines = df_test["edited_head_line"].values
        model = fasttext.train_supervised('train.txt',lr = 0.1,epoch=7)
        
#         clf = LinearSVC(random_state=0,max_iter=10000,verbose=2)
#         clf.fit(x_tr, y_tr)

        oof_train[test_index] = get_test_predictions(cv_test_lines,model)
        oof_test_skf[i, :] = get_test_predictions(test_lines,model)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [34]:
et_oof_train_grade0, et_oof_test_grade0 = get_oof(df, df_test,"grades_0")
et_oof_train_grade1, et_oof_test_grade1 = get_oof(df, df_test,"grades_1")
et_oof_train_grade2, et_oof_test_grade2 = get_oof(df, df_test,"grades_2")
et_oof_train_grade3, et_oof_test_grade3 = get_oof(df, df_test,"grades_3")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Doing for fold 0
Doing for fold 1
Doing for fold 2
Doing for fold 3
Doing for fold 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Doing for fold 1
Doing for fold 2
Doing for fold 3
Doing for fold 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Doing for fold 1
Doing for fold 2
Doing for fold 3
Doing for fold 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Doing for fold 1
Doing for fold 2
Doing for fold 3


In [35]:
et_oof_train_grade0.shape

(9652, 1)

In [36]:
et_oof_test_grade0.shape

(2419, 1)

In [37]:
x_train = np.concatenate(( et_oof_train_grade0, et_oof_train_grade1,et_oof_train_grade2 , et_oof_train_grade3), axis=1)
x_test = np.concatenate(( et_oof_test_grade0, et_oof_test_grade1, et_oof_test_grade2, et_oof_test_grade3), axis=1)

In [38]:
# import xgboost as xgb

# gbm = xgb.XGBClassifier(
#     #learning_rate = 0.02,
#  n_estimators= 2000,
#  max_depth= 4,
#  min_child_weight= 2,
#  #gamma=1,
#  gamma=0.9,                        
#  subsample=0.8,
#  colsample_bytree=0.8,
#  objective= 'binary:logistic',
#  nthread= -1,
#  scale_pos_weight=1).fit(x_train, y_train)
# predictions = gbm.predict(x_test)

In [39]:
import xgboost as xgb

In [40]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
skf = KFold(n_splits=4)
X = x_train
y = df["meanGrade"]

In [41]:
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = model = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42) 
    clf.fit(X_train,y_train)
    y_preds = clf.predict(X_test)
    print(sqrt(mean_squared_error(y_preds,y_test)))
    

  if getattr(data, 'base', None) is not None and \


0.5814319871153552


  if getattr(data, 'base', None) is not None and \


0.577066058663905


  if getattr(data, 'base', None) is not None and \


0.5805780653898097


  if getattr(data, 'base', None) is not None and \


0.5946042684302414


In [42]:
X_train

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [43]:
y

0       0.2
1       1.6
2       1.0
3       0.4
4       0.0
       ... 
9647    0.0
9648    0.4
9649    0.6
9650    1.4
9651    0.4
Name: meanGrade, Length: 9652, dtype: float64