In [57]:
import pandas as pd 
import glob
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
vectorizer = TfidfVectorizer()

In [59]:
L = []
for csv in glob.glob('*.csv'):
    df1 = pd.read_csv(csv, encoding='latin-1')
    L.append(df1)
    
df = pd.concat(L, axis = 0, sort=False)

# Data Cleaning

In [60]:
df['Euroskepticism_Score'].unique()

array([0.0, 0.5, 1.0, '0.5', '0', '1', nan, 'o', '0?'], dtype=object)

In [61]:
df['Nationalism_Score'].unique()

array([0.5, 0. , 1. , nan])

In [62]:
df['Populism_Score'].unique()

array([0.0, 0.5,
       ' Gaddafi attacked the French for ???interfering in his business?? ',
       '0', '0.5', '1',
       'EU must take appropriate and swift action to protect and guard its aquaculture against such unfair competition.',
       1.0, nan], dtype=object)

In [63]:
df['Question ID'].nunique()

2500

In [None]:
irreg_populism_scores = [' Gaddafi attacked the French for ???interfering in his business?? ',
    'EU must take appropriate and swift action to protect and guard its aquaculture against such unfair competition.']

irreg_euro_scores = ['o', '0?']

df = df[~df['Populism_Score'].isin(irreg_populism_scores) & ~df['Euroskepticism_Score'].isin(irreg_euro_scores)]

df['Euroskepticism_Score'] = df['Euroskepticism_Score'].astype('float')
df['Nationalism_Score'] = df['Nationalism_Score'].astype('float')
df['Populism_Score'] = df['Populism_Score'].astype('float')
df['Question Text'] = df['Question Text'].astype('str')

In [4]:
max_of_scores = dict()

for question_id in df['Question ID'].unique():
    a = np.max(df[(df['Question ID'] == question_id)]['Euroskepticism_Score'].fillna(0))
    b = np.max(df[(df['Question ID'] == question_id)]['Nationalism_Score'].fillna(0))
    c = np.max(df[(df['Question ID'] == question_id)]['Populism_Score'].fillna(0))
    T = df[(df['Question ID'] == question_id)]['Question Text']
    max_of_scores[question_id] = [T[T.index[0]], a, b, c]

scores = pd.DataFrame.from_dict(max_of_scores, orient = 'index')
scores = scores.rename(columns = {0: 'Question Text', 1: 'Euroskepticism_Score', 
                                        2: 'Nationalism_Score', 3: 'Populism_Score'})
scores.index = range(len(scores))

scores['Question Text'] = scores['Question Text'].apply(lambda x: str(x))
for score_type in ['Euroskepticism_Score','Nationalism_Score', 'Populism_Score']:
    scores[score_type] = scores[score_type].astype('float')
    scores[score_type] = scores[score_type].apply(np.ceil)

In [65]:
print(scores['Euroskepticism_Score'].unique())
print(scores['Nationalism_Score'].unique())
print(scores['Populism_Score'].unique())

[0. 1.]
[1. 0.]
[0. 1.]


# Ensemble Methods

In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import matthews_corrcoef

##### Euroskepticism

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, scores['Euroskepticism_Score'], test_size = .3)
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

In [39]:
print("accuracy: {:.4f} \n".format(accuracy_score(y_test, y_pred)))
print("confusion matrix: \n",  confusion_matrix(y_test, y_pred), "\n")
print("matthews_corrcoef:  {:.4f}".format(matthews_corrcoef(y_test, y_pred)))

accuracy: 0.9333 

confusion matrix: 
 [[700  10]
 [ 40   0]] 

matthews_corrcoef:  -0.0276


##### Nationalism

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, scores['Nationalism_Score'], test_size = .3)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

In [41]:
print("accuracy: {:.4f} \n".format(accuracy_score(y_test, y_pred)))
print("confusion matrix: \n",  confusion_matrix(y_test, y_pred), "\n")
print("matthews_corrcoef:  {:.4f}".format(matthews_corrcoef(y_test, y_pred)))

accuracy: 0.9453 

confusion matrix: 
 [[709   4]
 [ 37   0]] 

matthews_corrcoef:  -0.0167


##### Populism

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, scores['Populism_Score'], test_size = .3)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

In [43]:
print("accuracy: {:.4f} \n".format(accuracy_score(y_test, y_pred)))
print("confusion matrix: \n",  confusion_matrix(y_test, y_pred), "\n")
print("matthews_corrcoef:  {:.4f}".format(matthews_corrcoef(y_test, y_pred)))

accuracy: 0.9427 

confusion matrix: 
 [[707   3]
 [ 40   0]] 

matthews_corrcoef:  -0.0150


In [45]:
X = vectorizer.fit_transform(scores['Question Text'])

# 1D CNN for text classification with Keras

In [46]:
import keras
from keras.layers import LSTM
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

In [47]:
X

<2500x18777 sparse matrix of type '<class 'numpy.float64'>'
	with 222133 stored elements in Compressed Sparse Row format>

In [48]:
x_train, x_test, y_train, y_test = train_test_split(X, scores['Populism_Score'], test_size = .3)

In [49]:
max_features = 5000 #limit vocab 
maxlen = 400 #word sequence length
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3 #filter dim
hidden_dims = 250
epochs = 6


print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=X.shape[1]))
model.add(Dropout(0.2))

# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

model.add(GlobalMaxPooling1D())

model.add(Dense(hidden_dims))
model.add(Dropout(0.2))  #fraction of neurons to drop
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Build model...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1750 samples, validate on 750 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15

KeyboardInterrupt: 

# LSTM with Keras 

In [39]:
# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
# epochs = 15

In [40]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', #different loss function may be needed
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
Train...
Train on 1750 samples, validate on 750 samples
Epoch 1/2
Epoch 2/2
Test score: 0.1556070116609335
Test accuracy: 0.964
