In [33]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingClassifier
from nltk import word_tokenize,sent_tokenize
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score,GridSearchCV

# 1. Classification: Who Controls this Account

### Data exploration and clean

In [2]:
df = pd.read_csv("data/tweets.tsv",sep="\t",names=["tweet_id","user_handle","tweet_text","timestamp","device"])

In [3]:
df.shape

(4020, 5)

In [4]:
df

Unnamed: 0,tweet_id,user_handle,tweet_text,timestamp,device
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android
5,835946001873657858,realDonaldTrump,Big dinner with Governors tonight at White Hou...,2017-02-26 22:13:52,android
6,820251730407473153,realDonaldTrump,Congressman John Lewis should spend more time ...,2017-01-14 14:50:26,android
7,820255947956383744,realDonaldTrump,mention crime infested) rather than falsely co...,2017-01-14 15:07:12,android
8,820257714362314753,realDonaldTrump,INTELLIGENCE INSIDERS NOW CLAIM THE TRUMP DOSS...,2017-01-14 15:14:13,android
9,820425770925338624,realDonaldTrump,Congressman John Lewis should finally focus on...,2017-01-15 02:22:01,android


looks like there are NaNs in the data, which means that the data was not clear as said or that was not formatted as real tsv.  
i am removing the rows with NaN values for now.

In [5]:
df.dropna(inplace=True)

In [6]:
df.shape

(3167, 5)

looks like we lost about 50 lines

In [7]:
df

Unnamed: 0,tweet_id,user_handle,tweet_text,timestamp,device
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android
5,835946001873657858,realDonaldTrump,Big dinner with Governors tonight at White Hou...,2017-02-26 22:13:52,android
6,820251730407473153,realDonaldTrump,Congressman John Lewis should spend more time ...,2017-01-14 14:50:26,android
7,820255947956383744,realDonaldTrump,mention crime infested) rather than falsely co...,2017-01-14 15:07:12,android
8,820257714362314753,realDonaldTrump,INTELLIGENCE INSIDERS NOW CLAIM THE TRUMP DOSS...,2017-01-14 15:14:13,android
9,820425770925338624,realDonaldTrump,Congressman John Lewis should finally focus on...,2017-01-15 02:22:01,android


its been said that there are multiple devices used. i want to check which unique devices.

In [8]:
df["device"].unique()

array(['iphone', 'android',
       '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
       '<a href="http://instagram.com" rel="nofollow">Instagram</a>',
       '<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>',
       '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>',
       '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>',
       '<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>',
       '<a href="https://periscope.tv" rel="nofollow">Periscope.TV</a>'], dtype=object)

In [9]:
def change_device(df):
    """
    change the devices to a single word to represent it, group the different tweet sources
    """
    def change(device_line):
        if 'Twitter Web Client' in device_line:
            return 'twitter_web'
        elif 'Twitter for BlackBerry' in device_line:
            return 'twitter_bb'
        elif 'Twitter for iPad' in device_line:
            return 'twitter_ipad'
        elif 'TweetDeck' in device_line:
            return 'twitter_deck'
        elif 'instagram' in device_line:
            return 'instagram'
        elif 'periscope' in device_line:
            return 'periscope'
        elif 'Facebook' in device_line:
            return 'facebook'
        else:
            return device_line
    df["device"] = df["device"].apply(change)
    return df

so we're going to change the devices names for more convinient representation

In [10]:
change_device(df)

Unnamed: 0,tweet_id,user_handle,tweet_text,timestamp,device
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android
5,835946001873657858,realDonaldTrump,Big dinner with Governors tonight at White Hou...,2017-02-26 22:13:52,android
6,820251730407473153,realDonaldTrump,Congressman John Lewis should spend more time ...,2017-01-14 14:50:26,android
7,820255947956383744,realDonaldTrump,mention crime infested) rather than falsely co...,2017-01-14 15:07:12,android
8,820257714362314753,realDonaldTrump,INTELLIGENCE INSIDERS NOW CLAIM THE TRUMP DOSS...,2017-01-14 15:14:13,android
9,820425770925338624,realDonaldTrump,Congressman John Lewis should finally focus on...,2017-01-15 02:22:01,android


In [11]:
df["device"].unique()

array(['iphone', 'android', 'twitter_web', 'instagram', 'twitter_bb',
       'twitter_ipad', 'twitter_deck', 'facebook', 'periscope'], dtype=object)

In [12]:
df["user_handle"].unique()

array(['realDonaldTrump', 'PressSec', 'POTUS'], dtype=object)

In [13]:
df.sort_values(by='tweet_id')

Unnamed: 0,tweet_id,user_handle,tweet_text,timestamp,device
1053,593170823411699712,PressSec,RT @WhiteHouse: Watch President Obama speak on...,2015-04-29 00:51:59,twitter_web
1148,617108111703322624,realDonaldTrump,@marcorubio what do you say to the family of K...,2015-07-04 02:10:13,twitter_bb
1149,617112526988423170,realDonaldTrump,@marcorubio what do you say to the family of K...,2015-07-04 02:27:46,twitter_bb
1831,618270248811266048,realDonaldTrump,@bdean1: If I had a dollar for every time a po...,2015-07-07 07:08:08,android
1832,618417050193653760,realDonaldTrump,@hallmarkm1 @realDonaldTrump is the only Presi...,2015-07-07 16:51:29,twitter_web
3051,618742043444166656,realDonaldTrump,I will be in California this weekend making a ...,2015-07-08 14:22:53,android
3052,618774181510033408,realDonaldTrump,"Via @Newsmax_Media by @ChrisRuddyNMX: ""Donald ...",2015-07-08 16:30:35,twitter_web
2309,619116650613329920,realDonaldTrump,@williamonlyrent: @realDonaldTrump @krauthamme...,2015-07-09 15:11:26,android
2310,619116814040219648,realDonaldTrump,@mcgranejt: @realDonaldTrump @krauthammer @Chu...,2015-07-09 15:12:05,android
2311,619116974883389440,realDonaldTrump,@DavidBougs: @krauthammer Krauthammer sold som...,2015-07-09 15:12:44,android


### feature extraction

i plan to extract features numerical faetures about the text, the number of offensive words has been used in them.  
the offensive words comes from: https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en, https://gist.github.com/jamiew/1112488

In [14]:
def get_offensive1():
    """
    get the words from the file, only single words. filter combinations
    """
    words = []
    with open("data/offensive1.txt",'r') as f:
        words = [x.strip("\n") for x in f if " " not in x]
    return words

In [15]:
def get_offensive2():
    """
    get the words from the file, only single words. filter combinations
    """
    words = []
    with open("data/offensive2.txt",'r') as f:
        for line in f:
            line = line.strip("\n")
            parts = line.split(":")
            if len(line)!=0:
                word = parts[0].strip('"').strip(" ")
                if " " not in word:
                    words.append(word)
    return words

combine the offensive words

In [16]:
offensive_words = set(get_offensive1()+get_offensive2())

In [17]:
def extract_features(x):
    """
    extract the average/min/max for the word and sentences lenghts
    """
    sentences = sent_tokenize(x)
    words = word_tokenize(x)
    
    senteces_sizes =[len(word_tokenize(sentence)) for sentence in sentences]
    words_sizes = [len(word) for word in words]
    num_of_sentences = len(sentences)
    num_of_words = len(words)
    avg_sentences_length = np.mean(senteces_sizes)
    avg_words_length = np.mean(words_sizes)
    min_word_length = min(words_sizes)
    min_sentence_length = min(senteces_sizes)
    max_word_length = max(words_sizes)
    max_sentence_length = max(senteces_sizes)
    num_off_words = sum([1 for word in words if word in offensive_words])
    return np.asarray([num_of_sentences,min_sentence_length,avg_sentences_length,max_sentence_length,num_of_words,min_word_length,avg_words_length,max_word_length,num_off_words])

In [18]:
tmp_series = df["tweet_text"].apply(extract_features)

In [19]:
df["num_of_sentences"],df["min_sentence_length"],df["avg_sentences_length"],df["max_sentence_length"],df["num_of_words"],df["min_word_length"],df["avg_words_length"],df["max_word_length"],df["num_off_words"] = map(lambda x:x[0],tmp_series),map(lambda x:x[1],tmp_series),map(lambda x:x[2],tmp_series),map(lambda x:x[3],tmp_series),map(lambda x:x[4],tmp_series),map(lambda x:x[5],tmp_series),map(lambda x:x[6],tmp_series),map(lambda x:x[7],tmp_series),map(lambda x:x[8],tmp_series)

label all the android tweets as trumps and all the others as not

In [20]:
df["is_trump"] = df["device"]=="android"

In [21]:
df_known_not_trump = df[df["user_handle"]=="PressSec"]
df_known_trump = df[df["user_handle"]=="POTUS"]
df_unknown = df[df["user_handle"]=="realDonaldTrump"]

In [22]:
df_unknown["is_trump"].describe()

count     3154
unique       2
top       True
freq      2222
Name: is_trump, dtype: object

let's calculate the base line of a random model

In [30]:
baseline = float(2222)/3154

### models training

In [31]:
X = df[['num_of_sentences','min_sentence_length','avg_sentences_length','max_sentence_length','num_of_words','min_word_length','avg_words_length','max_word_length','num_off_words']]
y = df["is_trump"]

#### LogisticRegression

In [61]:
parameters = {'C':[1,10],'solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga')}
lr = LogisticRegression()
clf_rl = GridSearchCV(lr, parameters,cv=5)

In [62]:
clf_rl.fit(X,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1, 10], 'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [66]:
best_rl = clf_rl.best_estimator_
print best_rl
print clf_rl.best_score_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)
0.763814335333


#### SVM

In [70]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = SVC()
clf_svc = GridSearchCV(svc, parameters,cv=5)

In [71]:
clf_svc.fit(X,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [72]:
best_svc = clf_svc.best_estimator_
print best_svc
print clf_svc.best_score_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.785917271866


#### GradientBoostingClassifier

In [75]:
parameters = {'loss':('deviance', 'exponential'), 'learning_rate':[0.01, 0.1],"n_estimators":[100,300,500,700,1000],"max_depth":[3,6]}
gdb = GradientBoostingClassifier()
clf_gdb = GridSearchCV(gdb, parameters,cv=5)

In [78]:
clf_gdb.fit(X,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 300, 500, 700, 1000], 'loss': ('deviance', 'exponential'), 'learning_rate': [0.01, 0.1], 'max_depth': [3, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [79]:
best_gdb = clf_gdb.best_estimator_
print best_gdb
print clf_gdb.best_score_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
0.797916008841
