In [0]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
import pandas as pd
f_ = drive.CreateFile({'id': '1UJDXBrLmpfFL1C9mAdxi-klHs48gtHTv'})
f_.GetContentFile('train.csv')
df = pd.read_csv('train.csv')

In [0]:
# Resampling and train-validation split
from sklearn.model_selection import train_test_split


train_df, val_df = train_test_split(df, test_size=0.05, random_state=42)

sample_size = df[df.target == 1].shape[0]
train_df = train_df[train_df.target == 1].append(train_df[train_df.target == 0].sample(sample_size * 5, random_state=42)).reset_index()

In [0]:
import re
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
def filter_numerical(sentence):
  return re.sub(r'\b\d+(?:\.\d+)?\s+', '', sentence)

def remove_punctuation(sentence):
    return re.sub(r'[^\w\s]', '', sentence)

def filter_stopwords(sentence):
  return [word for word in sentence if word not in stopwords]

def lemmatize_text(sentence):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in sentence]

def stem_text(sentence):
    ps = PorterStemmer()
    return [ps.stem(word) for word in sentence]

def preprocess_text(df):
    df['processed'] = df['question_text'].apply(filter_numerical)
    df['processed'] = df['processed'].apply(remove_punctuation)

    df['processed'] = df['processed'].apply(word_tokenize)
    df['processed'] = df['processed'].apply(filter_stopwords)
    df['processed'] = df['processed'].apply(lemmatize_text)
    df['processed'] = df['processed'].apply(stem_text)
    return df

train_df = preprocess_text(train_df)
val_df = preprocess_text(val_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

## Logistic regresion  over TF-IDF representation


In [0]:
# TF-IDF representation
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vec = TfidfVectorizer(min_df=3,
                             max_features = 20_000,
                             analyzer="word",
                             ngram_range=(1,3))

tf_idf_train = tf_idf_vec.fit_transform(list(train_df["processed"].map(lambda tokens: " ".join(tokens))))
tf_idf_val = tf_idf_vec.transform(list(val_df["processed"].map(lambda tokens: " ".join(tokens))))

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import f1_score

lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(tf_idf_train, train_df.target)
pred = lr.predict(tf_idf_val)

In [0]:
train_pr = lr.predict(tf_idf_train)

print("On train set:")
print(cm(train_df.target, train_pr))
print(f1_score(train_df.target, train_pr))

print("On test set:")
print(cm(val_df.target, pred))
print(f1_score(val_df.target, pred))

On train set:
[[392219  11831]
 [ 24435  52433]]
0.7430348893234702
On test set:
[[59363  2002]
 [ 1288  2654]]
0.6173528727611073


In [0]:
# Assumption:fails to generalize because there are too much features
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

svd = TruncatedSVD(n_components=500, random_state=42)
reduced_tf_idf_train = svd.fit_transform(tf_idf_train)
reduced_tf_idf_val = svd.transform(tf_idf_val)

sc = StandardScaler().fit(reduced_tf_idf_train)
reduced_tf_idf_train = sc.transform(reduced_tf_idf_train)
reduced_tf_idf_val = sc.transform(reduced_tf_idf_val)

In [0]:
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(reduced_tf_idf_train, train_df.target)
pred = lr.predict(reduced_tf_idf_val)

print(cm(val_df.target, pred))
print(f1_score(val_df.target, pred))

[[59133  2232]
 [ 1794  2148]]
0.5162220620043259


In [0]:
# Adding sentiment feature
from textblob import TextBlob

train_df['polarity'] = train_df['processed'].map(lambda text: TextBlob(" ".join(text)).sentiment.polarity)
val_df['polarity'] = val_df['processed'].map(lambda text: TextBlob(" ".join(text)).sentiment.polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [0]:
from scipy.sparse import hstack
import numpy as np

lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(hstack((tf_idf_train ,np.array(train_df['polarity'])[:,None])), train_df.target)
pred = lr.predict(hstack((tf_idf_val ,np.array(val_df['polarity'])[:,None])))
train_pr = lr.predict(hstack((tf_idf_train ,np.array(train_df['polarity'])[:,None])))

print("On train set:")
print(cm(train_df.target, train_pr))
print(f1_score(train_df.target, train_pr))

print("On test set:")
print(cm(val_df.target, pred))
print(f1_score(val_df.target, pred))

On train set:
[[392197  11853]
 [ 24439  52429]]
0.7428834573149132
On test set:
[[59365  2000]
 [ 1288  2654]]
0.6174965100046533


In [0]:
def contains_trigger(text):
    # Selected words which are especially frequent w
    triggers = ['woman', 'trump', 'like', 'indian', 'muslim', 'american', 'men', 
                'white', 'black', 'liber', 'hate', 'like', 'support', 'sex', 'kill',
                'christian', 'democrat', 'presid', 'believ', 'jew', 'hindu', 'doe',
                'gay', 'child', 'donald', 'true', 'evil', 'conservat']
    for word in text:
      if word in triggers:
        return 1
    return 0

train_df['contains_trigger'] = train_df.processed.apply(contains_trigger)
val_df['contains_trigger'] = val_df.processed.apply(contains_trigger)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [0]:
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(hstack((tf_idf_train ,np.array(train_df['contains_trigger'])[:,None])), train_df.target)
pred = lr.predict(hstack((tf_idf_val ,np.array(val_df['contains_trigger'])[:,None])))
train_pr = lr.predict(hstack((tf_idf_train ,np.array(train_df['contains_trigger'])[:,None])))

print("On train set:")
print(cm(train_df.target, train_pr))
print(f1_score(train_df.target, train_pr))

print("On test set:")
print(cm(val_df.target, pred))
print(f1_score(val_df.target, pred))

On train set:
[[392015  12035]
 [ 24087  52781]]
0.7450523700629571
On test set:
[[59313  2052]
 [ 1286  2656]]
0.6141040462427745
