In [1]:
import pandas as pd
import numpy as np
from sklearn import feature_extraction
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk import collocations
import re, string
import matplotlib
import matplotlib.pyplot as plt
from sklearn import preprocessing
from scipy import sparse
from sklearn import linear_model
plt.rcParams['figure.figsize'] = [16, 6]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mdleiton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mdleiton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def getxy(row_s, row_f, feature_cols=['content',"sin_stopwords", 'followers', 'following', 'retweet',"n_mentioned","n_hashtags"], label_col=['troll']):
    return df[feature_cols][row_s:row_f], df[label_col][row_s:row_f]

In [3]:
df = pd.read_csv('alldataset_sentimental.csv', sep=",", encoding='utf-8')
users_copy = df[df["troll"] == False].copy()
trolls_copy =  df[df["troll"] == True].copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df.sample(5).head()

Unnamed: 0,author,content,followers,following,retweet,created_at,reply_to,n_mentioned,n_hashtags,user_mentioned,hashtags,troll,emoticones,sin_stopwords,polarity
378147,grillo_marino,@leinmir @MadeleineOster3 @Politico_pe El trab...,21,21,False,2019-08-07 23:32:38,leinmir,3.0,0,leinmir;MadeleineOster3;Politico_pe,,False,,trabajo congresistas miden solo cuantas leyes ...,0.0
173519,pretty_juliette,RT @Jaimefmacias: Hoy tenemos doblete con @Jos...,941,1618,True,2016-09-06 16:32:26,,2.0,0,Jaimefmacias;JoseBauz,,False,,hoy doblete chile bolivia peru ecuador,0.0
341692,GinoMayCry,RT @historyinmoment: Shanghai; China - 1990 vs...,961,1209,True,2019-08-20 01:44:12,,1.0,0,historyinmoment,,False,,shanghai china,0.0
197839,JonathanJAST23,Hakuna Matata!! 😎 @ Ciudadela Las Orquidias ht...,36,593,False,2016-10-16 06:54:33,,0.0,0,,,False,😎,hakuna matata ciudadela orquidias,0.0
365518,LaOrtecho,Quiero bajarme en Sáenz Peña; sentarme en la b...,165,338,False,2019-04-29 12:56:32,,0.0,0,,,False,,quiero bajarme sáenz peña sentarme banca frent...,0.0


In [5]:
df.shape

(395149, 15)

In [6]:
df = df[['content', 'followers', 'following', 'retweet',"n_mentioned","troll",'sin_stopwords',"n_hashtags"]].dropna()

In [7]:
df.isnull().values.any()

False

In [8]:
df.shape

(374753, 8)

In [9]:
df['retweet'].value_counts()

False    183555
True     127114
True      38653
False     25431
Name: retweet, dtype: int64

In [10]:
df['troll'].value_counts()

False    210906
True     163847
Name: troll, dtype: int64

In [11]:
df = df.sample(frac=1)

In [12]:
X_train, y_train = getxy(0,299802)

In [13]:
X_test, y_test = getxy(299803,316119)

In [14]:
X_train.shape, y_train.shape

((299802, 7), (299802, 1))

In [15]:
X_test.shape, y_test.shape

((16316, 7), (16316, 1))

### Tokenize words

In [18]:
stopwords_set = stopwords.words('spanish')
stopwords_set.extend(stopwords.words('english'))
stopwords_set = set(stopwords_set)
vocab_size=5000
tokenizer=feature_extraction.text.CountVectorizer(stop_words=stopwords_set, max_features=vocab_size)
tokenizer=tokenizer.fit(df['sin_stopwords'])

In [19]:
X_train_tok=tokenizer.transform(X_train['content'])
X_test_tok=tokenizer.transform(X_test['content'])

### Standarize followers/following

In [20]:
scaler = preprocessing.StandardScaler().fit(X_train[['followers','following',"n_mentioned","n_hashtags"]])

In [21]:
'rand means and scales: {}, {}'.format(scaler.mean_, scaler.scale_)

'rand means and scales: [5.35610287e+03 9.02373210e+02 1.34189232e+00 3.60728081e-01], [3.01633146e+04 2.76923953e+03 1.73039967e+00 1.20243866e+00]'

In [22]:
col_to_std = ['followers', 'following',"n_mentioned","n_hashtags"]
X_train[col_to_std]=scaler.transform(X_train[col_to_std])
X_test[col_to_std]=scaler.transform(X_test[col_to_std])

In [23]:
X_train[col_to_std].head()

Unnamed: 0,followers,following,n_mentioned,n_hashtags
225529,-0.17336,-0.267717,-0.19758,6.353149
342112,-0.14571,0.110726,0.380321,-0.299997
143463,-0.176509,-0.221134,-0.19758,-0.299997
320705,-0.169547,-0.313939,-0.19758,-0.299997
33653,-0.082455,0.887112,-0.775481,-0.299997


### Binarize outcome

In [24]:
y_train.head()

Unnamed: 0,troll
225529,False
342112,False
143463,True
320705,False
33653,True


In [25]:
bool_to_bin = lambda x: 1 if x else 0
y_train['troll'] = y_train['troll'].apply(bool_to_bin)
y_test['troll'] = y_test['troll'].apply(bool_to_bin)

# binarize retweet colum
X_train['retweet'] = X_train['retweet'].apply(bool_to_bin)
X_test['retweet'] = X_test['retweet'].apply(bool_to_bin)

### Concatenate features

In [26]:
def concatenate_features(tok_matrix, data_df):
    """ concatenate the tokenized matrix (scipy.sparse) with other features """
    sparse_cols = sparse.csr_matrix(data_df[['followers', 'following', 'retweet',"n_mentioned","n_hashtags"]])
    combined = sparse.hstack([tok_matrix, sparse_cols])
    return combined

In [27]:
X_train_combined = concatenate_features(X_train_tok, X_train)
X_test_combined = concatenate_features(X_test_tok, X_test)

In [28]:
X_train_combined.shape

(299802, 5005)

## Training the model - Logistic Regresion

In [229]:
logic_model = linear_model.LogisticRegression().fit(X_train_combined, y_train['troll'])



In [230]:
logic_model.score(X_train_combined, y_train['troll'])

0.855194428322693

In [231]:
logic_model.score(X_test_combined, y_test['troll'])

0.8494729100269673

## Creating Neural Net

In [29]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

Using TensorFlow backend.


In [30]:
X_train_combined.shape[1:]

(5005,)

In [31]:
model = Sequential()  # 5003 inputs
model.add(Dense(1024, activation='relu', input_shape=X_train_combined.shape[1:])) # first layer
model.add(Dropout(0.7))
model.add(Dense(512, activation='relu'))  # second layer 1024 inputs
model.add(Dropout(0.7))
model.add(Dense(128, activation='relu'))  # third layer  512 inputs
model.add(Dense(1, activation='sigmoid'))  # last layer





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [32]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              5126144   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 5,716,737
Trainable params: 5,716,737
Non-trainable params: 0
____________________________________________

In [33]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [34]:
# converting test scipy.sparse matrix to numpy
X_test_matrix = X_test_combined.todense()
X_train_matrix = X_train_combined.todense()

In [35]:
score = model.evaluate(X_test_matrix, y_test['troll'], verbose=0)
accuracy = 100*score[1]

print('Precisión en el conjunto de prueba: %.4f%%' % accuracy)

Precisión en el conjunto de prueba: 44.6127%


In [36]:
from keras.callbacks import ModelCheckpoint

checkpointer = ModelCheckpoint(filepath='trolls.model.best.hdf5', verbose=1, save_best_only=True)
hist = model.fit(X_train_matrix, y_train['troll'], batch_size=1024, epochs=30, validation_split=0.2, callbacks=[checkpointer], verbose=1, shuffle=True)

Train on 239841 samples, validate on 59961 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.31366, saving model to trolls.model.best.hdf5
Epoch 2/30

Epoch 00002: val_loss improved from 0.31366 to 0.29990, saving model to trolls.model.best.hdf5
Epoch 3/30

Epoch 00003: val_loss improved from 0.29990 to 0.29643, saving model to trolls.model.best.hdf5
Epoch 4/30

Epoch 00004: val_loss did not improve from 0.29643
Epoch 5/30

Epoch 00005: val_loss did not improve from 0.29643
Epoch 6/30

Epoch 00006: val_loss did not improve from 0.29643
Epoch 7/30

Epoch 00007: val_loss did not improve from 0.29643
Epoch 8/30

Epoch 00008: val_loss did not improve from 0.29643
Epoch 9/30

Epoch 00009: val_loss did not improve from 0.29643
Epoch 10/30

Epoch 00010: val_loss did not improve from 0.29643
Epoch 11/30

Epoch 00011: val_loss did not improve from 0.29643
Epoch 12/30

Epoch 00012: val_loss did not improve from 0.29643
Epoch 13/30

Epoch 00013: val_loss did not improve from 0.2964

In [37]:
model.load_weights('trolls.model.best.hdf5')
score = model.evaluate(X_test_matrix, y_test['troll'], verbose=0)
accuracy = 100*score[1]

# mostrar la precisión en prubea
print('Precisión durante la prueba: %.4f%%' % accuracy)
# save model and weight
algorithm_name = "LSTM"
model.save("model" + algorithm_name + ".h5")
model_json = model.to_json()
with open("model" + algorithm_name + ".json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model" + algorithm_name + ".h5")

Precisión durante la prueba: 87.5766%


In [40]:
X_train_matrix.shape

(299802, 5005)