In [1]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [2]:
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv(r'twitter.csv')
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [5]:
def preprocess2(tweet, new_col):
    def remove_pattern(input_txt, pattern):
        r = re.findall(pattern, input_txt)
        for word in r:
            input_txt = re.sub(word, "", input_txt)
        return input_txt
    df[new_col] = np.vectorize(remove_pattern)(tweet, "@[\w]*")
    df[new_col] = df[new_col].str.replace("[^a-zA-Z#]", " ")
    df[new_col] = df[new_col].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))
    tokenized_tweet = df[new_col].apply(lambda x: x.split())
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()

    tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = " ".join(tokenized_tweet[i])
    
    df[new_col] = tokenized_tweet
    return df[new_col]

In [6]:
 preprocess2(df['tweet'],'clean_tweet2')

0        when father dysfunct selfish drag kid into dys...
1        thank #lyft credit caus they offer wheelchair ...
2                                      bihday your majesti
3                               #model love take with time
4                                 factsguid societi #motiv
                               ...                        
31957                                           that youuu
31958    nina turner airwav tri wrap herself mantl genu...
31959                         listen song monday morn work
31960          #sikh #templ vandalis #calgari #wso condemn
31961                                         thank follow
Name: clean_tweet2, Length: 31962, dtype: object

In [7]:
voc_size=100000

In [8]:
onehot_repr=[one_hot(tweet,voc_size)for tweet in df['tweet']] 
#print(onehot_repr)

In [9]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [10]:
## pre padding
sent_length=100
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[    0     0     0 ... 67021 96542 51362]
 [    0     0     0 ... 50745 96897 12920]
 [    0     0     0 ... 85344 64917 93801]
 ...
 [    0     0     0 ... 85937   894 87161]
 [    0     0     0 ... 58786 16163 12578]
 [    0     0     0 ... 68348 95796 17660]]


In [11]:
df.label.unique()

array([0, 1], dtype=int64)

In [12]:
x_final = np.array(embedded_docs)
y_final = np.array(df['label'])
x_train,x_test,y_train,y_test = train_test_split(x_final,y_final,test_size=0.33, random_state=42)

In [13]:
sent_length=100
embedding_vector_features=40 ##features representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [14]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2016cf4f610>

In [15]:
from sklearn.metrics import classification_report
pred = model.predict(x_test)

In [16]:
y_pred = np.where(pred>0.5,1,0)
y_pred

array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]])

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9542093287827076

In [18]:
text = "its unbelievable that in the 21st century we'd need something like this. again. #neverump  #xenophobia "
onehot_repr=one_hot(text,voc_size)
embedded_doc=pad_sequences([onehot_repr],padding='pre',maxlen=sent_length)
if model.predict(embedded_doc) >0.5:
    print('negative tweet')
else:
    print('positive tweet')

positive tweet


In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[9623,  183],
       [ 300,  442]], dtype=int64)

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9542093287827076

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      9806
           1       0.71      0.60      0.65       742

    accuracy                           0.95     10548
   macro avg       0.84      0.79      0.81     10548
weighted avg       0.95      0.95      0.95     10548



In [22]:
acc= model.history['accuracy']
val_acc =model.history['val_accuracy']
loss = model.history['loss']
val_loss =model.history['val_loss']

TypeError: 'History' object is not subscriptable

In [None]:
plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(range(10), acc, label='Training Accuracy')
plt.plot(range(10), val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(range(10), loss, label='Training Loss')
plt.plot(range(10), val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()