# Assignment 1 
## Niall Buckley
## 115571753

In [54]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.tools.plotting import scatter_matrix
from sklearn.feature_extraction import stop_words
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_decomposition import CCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.dummy import DummyClassifier

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_selected = X[self.attribute_names]
        if self.dtype:
            return X_selected.astype(self.dtype).values
        return X_selected.values

class FeatureBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, features_values):
        self.features_values = features_values
        self.num_features = len(features_values)
        self.labelencodings = [LabelEncoder().fit(feature_values) for feature_values in features_values]
        self.onehotencoder = OneHotEncoder(sparse=False,
            n_values=[len(feature_values) for feature_values in features_values])
        self.last_indexes = np.cumsum([len(feature_values) - 1 for feature_values in self.features_values])
    def fit(self, X, y=None):
        for i in range(0, self.num_features):
            X[:, i] = self.labelencodings[i].transform(X[:, i])
        return self.onehotencoder.fit(X)
    def transform(self, X, y=None):
        for i in range(0, self.num_features):
            X[:, i] = self.labelencodings[i].transform(X[:, i])
        onehotencoded = self.onehotencoder.transform(X)
        return np.delete(onehotencoded, self.last_indexes, axis=1)
    def fit_transform(self, X, y=None):
        onehotencoded = self.fit(X).transform(X)
        return np.delete(onehotencoded, self.last_indexes, axis=1)
    def get_params(self, deep=True):
        return {"features_values" : self.features_values}
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            self.setattr(parameter, value)
        return self
    
class MissingValueImputer(Imputer):
    def __init__(self, **kwargs):
        Imputer.__init__(self, **kwargs)
    def fit(self, X, y=None):
        if self.strategy == "most_frequent":
            self.fills = pd.DataFrame(X).mode(axis=0).squeeze() 
            return self
        else:
            return Imputer.fit(self, X, y=y)
    def transform(self, X):
        if hasattr(self, "fills"):
            return pd.DataFrame(X).fillna(self.fills).values
        else:
            return Imputer.transform(self, X)

In [57]:
df = pd.read_csv('dataset_statements.tsv', sep='\t')

In [58]:
# Shuffle
df = df.take(np.random.permutation(len(df)))

### Explore the dataset

In [59]:
df.shape

(12791, 14)

In [60]:
df.dtypes

id                    int64
label                object
statement            object
subject              object
speaker              object
job                  object
state                object
party                object
num_barely_trues    float64
num_falses          float64
num_half_trues      float64
num_mostly_trues    float64
num_pants_fires     float64
location             object
dtype: object

In [61]:
df.columns

Index(['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state',
       'party', 'num_barely_trues', 'num_falses', 'num_half_trues',
       'num_mostly_trues', 'num_pants_fires', 'location'],
      dtype='object')

In [62]:
for cols in df.columns:
    print(df[cols].isnull().sum())

0
0
0
2
2
3567
2749
2
2
2
2
2
2
131


In [63]:
df.describe(include= "all")

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,num_barely_trues,num_falses,num_half_trues,num_mostly_trues,num_pants_fires,location
count,12791.0,12791,12791,12789,12789,9224,10042,12789,12789.0,12789.0,12789.0,12789.0,12789.0,12660
unique,,6,12765,4534,3309,1355,85,24,,,,,,5142
top,,half-true,On changing the rules for filibusters on presi...,health-care,barack-obama,President,Texas,republican,,,,,,a news release
freq,,2627,3,474,611,615,1260,5665,,,,,,309
mean,6773.300211,,,,,,,,11.583939,13.359059,17.185785,16.49785,6.251388,
std,3906.695086,,,,,,,,18.978037,24.140086,35.847678,36.165276,16.180777,
min,1.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,
25%,3368.5,,,,,,,,0.0,0.0,0.0,0.0,0.0,
50%,6818.0,,,,,,,,2.0,2.0,3.0,3.0,1.0,
75%,10145.5,,,,,,,,12.0,15.0,13.0,12.0,5.0,


### Clean the dataet
#### I will try to keep as much useful information from the dataset as possible

ID will not be necessary for this project as it's not very informative.

In [64]:
df.drop('id', axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

#### clean speaker column
speaker column looks a little strange <br>
President frequecy = 615, yet obama only appears in dataset 611 times <br>
were 3 speakers incorrectly labelled president?

In [None]:
x = df[(df['job'] == 'President') & (df['speaker'] != 'barack-obama')].copy()
x['speaker']

The 3 non obama speakers were George-Bush which makes sense.

#### clean subject column
It looks O.K. from the describe except the count for speaker does not match the dataset count seen in shape
Lets see if it has any nans

In [66]:
#Subject only contains two NaNs and the rows are pretty sparse of information
#so it can go
subject_nans = (df[(df["subject"].isnull() == 1)]).copy()
subject_nans

Unnamed: 0,label,statement,subject,speaker,job,state,party,num_barely_trues,num_falses,num_half_trues,num_mostly_trues,num_pants_fires,location
4380,False,"Joe, I keep hearing you every morning talking ...",,,,,,,,,,,
7277,False,The fact is that although we have had a presid...,,,,,,,,,,,


It does. It also shows the 2 NaNs shown for other columns too (including party, subject, speaker).

Let's remove these as it is so sparse of data it couldn't be used for much anyway

subject column contains a lot of useful information but it is too diverse, first sepeate each subject into its own string in an array, now the subject column contains arrays of subject strings. This is not great for getting unique features and trying to find a pattern so we must choose a subject at random (we can't just delete the first one as it's alphabetical). So we get the size of the array and randomly pick a subject from the list to assign to the statement. <br>

In [67]:
df.dropna(subset=["subject"], inplace=True)
df.reset_index(drop=True, inplace=True)

In [68]:
#df['subject'] = df['subject'].astype(str)
df['subject'] = [subs.split(',') for subs in df['subject'].values]

In [69]:
from random import randint
df['subject'] = [i[randint(0,len(i)-1)] for i in df['subject'].values]

Still contains a lot of non numeric or alphabetic data <br>
Need to get rid of this.

In [70]:
#code for lower taken from https://www.programiz.com/python-programming/methods/string/lower
df['subject'] = df['subject'].str.lower()
df['subject'] = df['subject'].replace(['[^a-zA-Z0-9]+'], [' '], regex=True)
df.reset_index(drop=True, inplace=True)

#### clean jobs column <br>
it's clearly a dirty feature as only 9224 / 12791 contain information. <br>
lets look at these null values <br>

In [73]:
job_nans = (df[(df["job"].isnull() == 1)]).copy()
job_nans['party'].unique()

array(['democrat', 'republican', 'libertarian', 'none', 'organization',
       'independent', 'business-leader', 'newsmaker',
       'county-commissioner', 'columnist', 'talk-show-host',
       'tea-party-member', 'activist', 'government-body', 'journalist',
       'labor-leader', 'state-official'], dtype=object)

Well we can see a lot of the nan jobs are politicians or work in media <br>
Lets try reduce the nans by compining the features <br>

In [74]:
df['jobs'] = np.where(df['job'].isnull(), df['party'], df['job'])

#### clean states column <br>
We see from the describe all table that its says there are 85 unique states despite the fact that there are only 50 states in America.


In [77]:
df["state"].unique()

array(['Maryland', 'Georgia', nan, 'Pennsylvania', 'New York',
       'Tennessee', 'Nevada', 'Washington, D.C.', 'New Jersey',
       'Wisconsin', 'California', 'Massachusetts', 'Illinois',
       'Minnesota', 'Alabama', 'Texas', 'Arizona', 'New Hampshire',
       'Ohio', 'Delaware', 'Florida', 'Oregon', 'New Mexico', 'Colorado',
       'North Carolina', 'Rhode Island', 'Missouri', 'Washington',
       'Washington, D.C. ', 'Iowa', 'Arkansas', 'Virginia', 'Connecticut',
       'Vermont', 'Alaska', 'Kentucky', 'Kansas', 'Louisiana',
       'South Carolina', 'Indiana', 'Michigan', 'California ', 'Qatar',
       'Georgia ', 'District of Columbia', 'Utah', 'West Virginia',
       'Virginia ', 'Montana', 'Virgiia', 'Colorado ', 'Wisconsin ',
       'Tennesse', 'Massachusetts ', 'Mississippi', 'None', 'Oregon ',
       'South Dakota', 'Oklahoma', 'Maine', 'Atlanta', 'Washington state',
       'Virgina', 'Nebraska', 'Unknown', 'ohio', 'Wyoming', 'Florida ',
       'Rhode Island ', 'Idaho', 'Un

A lot of state information was mistyped or entered incorrectly <br>
We will fix it by replacing the incorrectly entered data with better labels<br>
It also contains nans nones and unknows which will all become 'none' - I am assuming that these values have some significance, or something in common. <br>
The person who created this data set also included countries in the state column I changed this to 'Not America'. <br>
Also said Washing DC is a state altough it tecnically, is not.<br>

In [80]:
df['state'].replace(['Virginia ', 'Virginia director, Coalition to Stop Gun Violence', 'Virgiia', 'Virgina'], "Virginia", inplace = True)
df['state'].replace([['New York '], 'New York'],inplace = True)
df['state'].replace(['Washington, D.C.', 'Washington D.C.', 'Washington, D.C. ', 'District of Columbia'],'Washington DC', inplace = True)
df['state'].replace(['Georgia ', 'Georgia  '], 'Georgia', inplace = True)
df['state'].replace(['Wisconsin '],'Wisconsin', inplace = True)
df['state'].replace([np.nan, 'Unknown'], 'None', inplace = True)
df['state'].replace(['Washington state'],'Washington', inplace = True)
df['state'].replace(['Tex'] , "Texas", inplace = True)
df['state'].replace(["PA - Pennsylvania"], "Pennsylvania", inplace = True)
df['state'].replace(['Qatar', 'China','Russia', 'United Kingdom'],'Not America', inplace = True)
df['state'].replace(['New Hampshire '],'New Hampshire', inplace = True)
df['state'].replace(['ohio'],'Ohio', inplace = True)
df['state'].replace(['California '],'California', inplace = True)
df['state'].replace(['Oregon '],'Oregon', inplace = True)
df['state'].replace(['Illinois '],'Illinois', inplace = True)
df['state'].replace(['Massachusetts '],'Massachusetts', inplace = True)
df['state'].replace(['Colorado '],'Colorado', inplace = True)
df['state'].replace(['Florida '],'Florida', inplace = True)
df['state'].replace(['Rhode island', 'Rhode Island '],'Rhode Island', inplace = True)
df['state'].replace(['the United States'],df['state'].mode(), inplace = True)

array(['Maryland', 'Georgia', 'None', 'Pennsylvania', 'Tennessee',
       'Nevada', 'Washington DC', 'New Jersey', 'Wisconsin', 'California',
       'Massachusetts', 'Illinois', 'Minnesota', 'Alabama', 'Texas',
       'Arizona', 'New Hampshire', 'Ohio', 'Delaware', 'Florida',
       'Oregon', 'New Mexico', 'Colorado', 'North Carolina',
       'Rhode Island', 'Missouri', 'Washington', 'Iowa', 'Arkansas',
       'Virginia', 'Connecticut', 'Vermont', 'Alaska', 'Kentucky',
       'Kansas', 'Louisiana', 'South Carolina', 'Indiana', 'Michigan',
       'Not America', 'Utah', 'West Virginia', 'Montana', 'Tennesse',
       'Mississippi', 'South Dakota', 'Oklahoma', 'Maine', 'Atlanta',
       'Nebraska', 'Wyoming', 'Idaho', 'North Dakota', 'Hawaii'],
      dtype=object)

Are party values O.K?

In [81]:
df['party'].count()

12789

In [82]:
df['party'].unique()

array(['democrat', 'none', 'republican', 'libertarian', 'activist',
       'organization', 'independent', 'state-official', 'talk-show-host',
       'journalist', 'newsmaker', 'business-leader', 'columnist',
       'liberal-party-canada', 'county-commissioner', 'labor-leader',
       'green', 'democratic-farmer-labor', 'education-official',
       'tea-party-member', 'constitution-party', 'government-body',
       'Moderate', 'ocean-state-tea-party-action'], dtype=object)

Looks good, uses term party loosely but that's ok with me

#### Location <br>
Lets look at the nans now

In [None]:
location_nans = (df[(df["location"].isnull() == 1)]).copy()
#location_nans['state']

Fair to say we can remove them now. <br>
A lot of the nans for location have a value in state which could be considered a location so we'll use that so we donn't have to
remove too many nans

In [None]:
df['place'] = np.where(df['location'].isnull(), df['state'], df['location'])
#df['place']

Location could also be cleaner if we remove stop words and all non numeric or alphabetic letters from the feature <br>

In [88]:
#https://stackoverflow.com/questions/43358857/how-to-remove-special-characters-except-space-from-a-file-in-python
#stop = stop_words.ENGLISH_STOP_WORDS
stop = stop_words.ENGLISH_STOP_WORDS
df.dropna(subset=["place"], inplace=True)
df['place'] = df['place'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['place'] = df['place'].replace(['[^a-zA-Z0-9]+'], [' '], regex=True)
df['place'] = df['place'].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 1]))
df.reset_index(drop=True, inplace=True)


Let's see the changes

In [None]:
df.drop('job', axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.drop('location', axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
#df.describe(include="all")

### Run the predictions <br>

In [90]:
#encode the labels
y = df["label"].values

encoder = LabelEncoder()

y_encoded = encoder.fit_transform(y)

Include various pipelines to run various parameter and strategys to try predict the class the input data belong to

In [93]:
nominal_features = ["place","subject","jobs", "speaker", "party"]
# Create the cross-entropy pipeline
pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)), 
        #("imputer", MissingValueImputer(missing_values="NaN", strategy="most_frequent")),
        ("binarizer", FeatureBinarizer([df[feature].unique() for feature in nominal_features])), 
        ("estimator", LogisticRegression(multi_class="multinomial", solver="newton-cg")) 
    ])

nominal_pipeline_with_CCA = Pipeline([
        ("selector", DataFrameSelector(nominal_features)),
        ("binarizer", FeatureBinarizer([df[feature].unique() for feature in nominal_features])),
        ("cca", CCA(n_components=1)),
        ("estimator", LogisticRegression())
    ])

# Create the one-versus-rest pipeline
ovr_pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)),
        ("binarizer", FeatureBinarizer([df[feature].unique() for feature in nominal_features])),
        ("estimator", LogisticRegression())
    ])

# Create the classifier
maj_pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)),
        ("binarizer", FeatureBinarizer([df[feature].unique() for feature in nominal_features])),
        ("estimator", DummyClassifier(strategy = "most_frequent"))
    ])
strat_pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)),  
        ("binarizer", FeatureBinarizer([df[feature].unique() for feature in nominal_features])),
        ("estimator", DummyClassifier())
    ])

In [94]:
# Create the object that splits the data
ss = ShuffleSplit(n_splits=1, train_size=0.8)
ssL = ShuffleSplit(n_splits=1, train_size=0.5)
ssH = ShuffleSplit(n_splits=1, train_size=0.90)
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.8)
kf = KFold(n_splits = 10)



Here I'm trying to determine what stragety works best to predict the class the input data belongs to. The scoring is a percentage based on how many predictions were correct. <br>
I compare my predictions to the dummy classifiers, 
I imagine that holdout would be better suited to this dataset as it's quite large. And the compution time of k-fold would be unnecessarly huge for a data set of this size<br>
**I removed k-fold predictions as computation time was too long.** <br>
**I left in my best performing classifiers but you can see I experimented with parameters and methods.** <br>

In [96]:
#print("Cross entropy using k-fold: ", np.mean(cross_val_score(pipeline, df, y_encoded, scoring="accuracy", cv=10)))
print("One-versus-Rest: ", np.mean(cross_val_score(ovr_pipeline, df, y_encoded, scoring="accuracy", cv=ss)))
print("Majority-class classifier: ", np.mean(cross_val_score(maj_pipeline, df, y_encoded, scoring="accuracy", cv=10)))

One-versus-Rest:  0.23377638780297108
Majority-class classifier:  0.20541108212461062


In [None]:
#Compare with stratified results
#print("Cross entropy using k-fold: ", np.mean(cross_val_score(pipeline, df, y_encoded, scoring="accuracy", cv=10)))
#print("Cross entropy with holdout stratified shuffle: ", np.mean(cross_val_score(pipeline, df, y_encoded, scoring="accuracy", cv=sss)))
#print("Stratified-class classifier: ", np.mean(cross_val_score(strat_pipeline, df, y_encoded, scoring="accuracy", cv=10)))

In [97]:
#print("Cross entropy using holdout 50% training: ", np.mean(cross_val_score(pipeline, df, y_encoded, scoring="accuracy", cv=ssL)))
print("cross entropy multi-class: ", np.mean(cross_val_score(pipeline, df, y_encoded, scoring="accuracy", cv=ss)))
#print("Cross entropy using holdout 90% training: ", np.mean(cross_val_score(pipeline, df, y_encoded, scoring="accuracy", cv=ssH)))


cross entropy multi-class:  0.2509773260359656


Classifiers perform better than dummy classifiers!

### Lets compare these results compare to how it would have done with minimal data cleaning
I will use the classifier that performed best and assume the trend is the same when the dataset is not dirty.

In [98]:
df = pd.read_csv('dataset_statements.tsv', sep='\t')

In [99]:
df = df.take(np.random.permutation(len(df)))

In [100]:
df.drop('id', axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [101]:
df.describe(include= "all")

Unnamed: 0,label,statement,subject,speaker,job,state,party,num_barely_trues,num_falses,num_half_trues,num_mostly_trues,num_pants_fires,location
count,12791,12791,12789,12789,9224,10042,12789,12789.0,12789.0,12789.0,12789.0,12789.0,12660
unique,6,12765,4534,3309,1355,85,24,,,,,,5142
top,half-true,On a cap-and-trade plan.,health-care,barack-obama,President,Texas,republican,,,,,,a news release
freq,2627,3,474,611,615,1260,5665,,,,,,309
mean,,,,,,,,11.583939,13.359059,17.185785,16.49785,6.251388,
std,,,,,,,,18.978037,24.140086,35.847678,36.165276,16.180777,
min,,,,,,,,0.0,0.0,0.0,0.0,0.0,
25%,,,,,,,,0.0,0.0,0.0,0.0,0.0,
50%,,,,,,,,2.0,2.0,3.0,3.0,1.0,
75%,,,,,,,,12.0,15.0,13.0,12.0,5.0,


In [102]:
df.dropna(subset=["location"], inplace=True)
df.reset_index(drop=True, inplace=True)

In [103]:
def makeFeaturesStrings(df):
    df['location'] = df['location'].astype(str)
    df['subject'] = df['subject'].astype(str)
    df['speaker'] = df['speaker'].astype(str)
    df['party'] = df['party'].astype(str)
    df['job'] = df['job'].astype(str)
    df['state'] = df['state'].astype(str)
    df.reset_index(drop=True, inplace=True)
    return df

In [104]:
df = makeFeaturesStrings(df)

In [105]:
#encode the labels
y = df["label"].values

encoder = LabelEncoder()

y_encoded = encoder.fit_transform(y)

In [106]:
nominal_features = ["location", "job", "party", "speaker", "subject"]

pipeline = Pipeline([
        ("selector", DataFrameSelector(nominal_features)), 
        ("binarizer", FeatureBinarizer([df[feature].unique() for feature in nominal_features])), 
        ("imputer", MissingValueImputer(missing_values="NaN", strategy="most_frequent")),
        ("estimator", LogisticRegression()) 
])

nominal_pipeline_with_CCA = Pipeline([
        ("selector", DataFrameSelector(nominal_features)),
        ("binarizer", FeatureBinarizer([df[feature].unique() for feature in nominal_features])),
        ("cca", CCA(n_components=1)),
        ("estimator", LogisticRegression())
])


In [107]:
print("One-versus-Rest: ", np.mean(cross_val_score(pipeline, df, y_encoded, scoring="accuracy", cv=ss)))

One-versus-Rest:  0.25908372827804105


In [None]:
print("One-versus-Rest with CCA: ", np.mean(cross_val_score(nominal_pipeline_with_CCA, df, y_encoded, scoring="accuracy", cv=ss)))

The second classifier works slightly better I was better off not cleaning the data so thoroughly. 