Importing Packages

In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

Using TensorFlow backend.


In [2]:
NB_WORDS = 10000  # Parameter indicating the number of words we'll put in the dictionary
VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 5  # Number of epochs we usually start to train with
BATCH_SIZE = 32  # Size of the batches used in the mini-batch gradient descent
df = pd.read_csv(r'C:\Users\saini\Documents\Projects\Twitter sentiment analysis\sentiment-analysis-of-tweets\train.txt')

In [3]:
df1 = df.drop(['tweet_id'], axis=1)

In [4]:
df1

Unnamed: 0,sentiment,tweet_text
0,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,negative,Iranian general says Israel\u2019s Iron Dome c...
4,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
...,...,...
21460,neutral,"the day after newark ill be able to say """"i me..."
21461,neutral,FEC hold farewell session for seven ministers ...
21462,neutral,Luca Di Montezemolo (who's last day was Monday...
21463,positive,Coffee is pretty much the answer to all questi...


# Preprocessing text
### 1] removing the stopwords in the tweets
### 2] removing the mentions in the tweets

In [5]:
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
        # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 
    
def remove_mentions(input_text):
    return re.sub(r'@\w+', '', input_text)

df1.tweet_text = df1.tweet_text.apply(remove_stopwords).apply(remove_mentions)

In [6]:
df1.tweet_text[4]

'Tehran\\u002c Mon Amour: Obama Tried Establish Ties Mullahs http://t.co/TZZzrrKa via  No Barack Obama Vote Mitt Romney'

In [7]:
X = df1.tweet_text
y = df1.sentiment

### Converting words to numbers

In [8]:
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")
tk.fit_on_texts(X) 

In [9]:
X_train_seq = tk.texts_to_sequences(X)

In [10]:
def one_hot_seq(seqs, nb_features = NB_WORDS): # converting the integers to one-hot encoded features.
    ohs = np.zeros((len(seqs), nb_features))
    for i, s in enumerate(seqs):
        ohs[i, s] = 1.
    return ohs
X_train_oh = one_hot_seq(X_train_seq)

In [11]:
X_train_oh[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [12]:
# converting target classes to numbers and converted to one-hot-encoded-form
le = LabelEncoder()  
y_train_le = le.fit_transform(y)
y_train_oh = to_categorical(y_train_le)

In [13]:
drop_model = models.Sequential()
drop_model.add(layers.Dense(64, activation='relu', input_shape=(NB_WORDS,)))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(64, activation='relu'))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(3, activation='softmax'))
drop_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                640064    
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 195       
Total params: 644,419
Trainable params: 644,419
Non-trainable params: 0
_________________________________________________________________


In [14]:
drop_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [15]:
drop_model.fit(X_train_oh, y_train_oh, epochs=NB_START_EPOCHS, batch_size=BATCH_SIZE, verbose=0)

<keras.callbacks.callbacks.History at 0x26198b68d08>

In [16]:
y_pred = drop_model.predict(X_train_oh)

In [17]:
max(y_pred[1])

0.8412701

In [18]:
y_pred[1]

array([0.8412701 , 0.13318643, 0.02554344], dtype=float32)

In [19]:
df

Unnamed: 0,tweet_id,sentiment,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
...,...,...,...
21460,522949024132112384,neutral,"the day after newark ill be able to say """"i me..."
21461,522372593312350209,neutral,FEC hold farewell session for seven ministers ...
21462,522515200592052224,neutral,Luca Di Montezemolo (who's last day was Monday...
21463,523089087155437568,positive,Coffee is pretty much the answer to all questi...


In [20]:
y_train_le

array([2, 0, 0, ..., 1, 2, 1])

In [21]:
test_data = pd.read_csv(r'C:\Users\saini\Documents\Projects\Twitter sentiment analysis\sentiment-analysis-of-tweets\test_samples.txt')
test_data

Unnamed: 0,tweet_id,tweet_text
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor..."
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa..."
4,171874368908050432,"Excuse the connectivity of this live stream, f..."
...,...,...
5393,210378118865756160,It's a Wednesday girls night out as '90's band...
5394,245177521304399872,"night college course sorted, just have to enro..."
5395,259280987089932288,For the 1st time in 30 years. For your splendi...
5396,201113950211940352,NURSES DAY - 12 MAY 2012. Nursing: The heart b...


In [22]:
test_tweets = pd.DataFrame(test_data.tweet_text, columns=['tweet_text'])
test_tweets.tweet_text = test_tweets.tweet_text.apply(remove_stopwords).apply(remove_mentions)

In [23]:
test_tweets

Unnamed: 0,tweet_text
0,"Atlantic city, ventnor, margate, ocean city a..."
1,Musical awareness: Great Big Beautiful Tomorro...
2,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3,"Kapan sih lo ngebuktiin,jan ngomong doang Susa..."
4,"Excuse connectivity live stream, Baba Amr, man..."
...,...
5393,It's Wednesday girls night '90's band Wilson P...
5394,"night college course sorted, enrole tomorrow n..."
5395,For 1st time 30 years. For splendiferous enter...
5396,NURSES DAY 12 MAY 2012. Nursing: The heart bea...


In [24]:
x_test = tk.texts_to_sequences(test_tweets.tweet_text)

In [25]:
x_test_oh = one_hot_seq(x_test)

In [26]:
test_prediction = drop_model.predict(x_test_oh)

In [27]:
arr1 = []
for i in range(len(test_prediction)):
    arr = test_prediction[i]
    max_elem = max(arr)
    n = np.where(arr == max_elem)
    arr1.append(int(n[0][0]))

In [28]:
test_sentiment = pd.DataFrame(arr1, columns=['sentiment'])
test_sentiment

Unnamed: 0,sentiment
0,2
1,2
2,1
3,0
4,1
...,...
5393,2
5394,1
5395,1
5396,1


In [29]:
re_encod = {2: 'positive', 0: 'negative', 1: 'neutral'}
test_sentiment.sentiment = test_sentiment.sentiment.apply(lambda x: re_encod[int(x)])

In [30]:
test_sentiment

Unnamed: 0,sentiment
0,positive
1,positive
2,neutral
3,negative
4,neutral
...,...
5393,positive
5394,neutral
5395,neutral
5396,neutral


In [31]:
test_data.drop(['tweet_text'], axis=1, inplace=True)

In [32]:
test_data['sentiment'] = test_sentiment

In [33]:
test_data

Unnamed: 0,tweet_id,sentiment
0,264238274963451904,positive
1,218775148495515649,positive
2,258965201766998017,neutral
3,262926411352903682,negative
4,171874368908050432,neutral
...,...,...
5393,210378118865756160,positive
5394,245177521304399872,neutral
5395,259280987089932288,neutral
5396,201113950211940352,neutral


In [34]:
import csv
test_data.to_csv(r'C:\Users\saini\Documents\Projects\Twitter sentiment analysis\sentiment-analysis-of-tweets\sentiment21.csv', index=False,  quoting=csv.QUOTE_NONE)