In [None]:
# Mount Google Drive at /content/drive so later cells can reach the project folder.
from google.colab import drive
# force_remount=True requests a fresh mount even if one is already present.
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
# List the project folder to confirm the data directory and the GloVe files are there.
!ls drive/My\ Drive/ML-work/NLP/predict-news-sentiment/

data  glove.6B.50d.txt	glove.6B.50d.txt.zip


In [None]:
# Switch the working directory to the project folder; later cells use "./" relative paths.
%cd drive/My\ Drive/ML-work/NLP/predict-news-sentiment/

/content/drive/My Drive/ML-work/NLP/predict-news-sentiment


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer #word stemmer class
lemma = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding

In [None]:
# Download the NLTK stopword corpus that preprocess_text reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
!unzip -q "./glove.6B.50d.txt.zip"

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup table.
# Every line of the file is "<word> <v1> <v2> ... <vN>", separated by spaces.
embeddings_index = {}
# The context manager closes the file even if parsing raises, and the explicit
# encoding keeps the load independent of the platform's default codec.
with open('./glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Read the training set (with sentiment targets) and the test set from ./data.
train, test = pd.read_csv('./data/train.csv'), pd.read_csv('./data/test.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084


In [None]:
# Count the missing values in every training column and show them as a table
# (one row per column name, the count in column 0).
null_counts = train.isnull().sum()
missing_val = pd.DataFrame(null_counts).reset_index()
missing_val

Unnamed: 0,index,0
0,IDLink,0
1,Title,0
2,Headline,0
3,Source,175
4,Topic,0
5,PublishDate,0
6,Facebook,0
7,GooglePlus,0
8,LinkedIn,0
9,SentimentTitle,0


In [None]:
# Discard every training row that has a missing value in any column.
train = train.dropna()

In [None]:
# Count the missing values per column in the test set.
test.isnull().sum()

IDLink           0
Title            0
Headline         0
Source         101
Topic            0
PublishDate      0
Facebook         0
GooglePlus       0
LinkedIn         0
dtype: int64

In [None]:
# 'Source' holds publisher names (text), so it has no mean; asking for one is
# what raised the TypeError. Fill the gaps with a placeholder label instead and
# assign the result back rather than mutating the column in place.
test['Source'] = test['Source'].fillna('Unknown')

TypeError: ignored

In [None]:
# Summary statistics for the numeric columns of the test set.
test.describe()

Unnamed: 0,Facebook,GooglePlus,LinkedIn
count,37187.0,37187.0,37187.0
mean,85.063033,2.903837,19.982843
std,420.610253,13.556345,225.746903
min,-1.0,-1.0,-1.0
25%,0.0,0.0,0.0
50%,4.0,0.0,0.0
75%,28.0,2.0,4.0
max,16598.0,1016.0,20341.0


In [None]:
# Raw title text and its sentiment target. The target is selected with a list
# of column names, so it comes back as a 2-D array with a single column.
X_train_title = train['Title'].values
y_train_title = train[['SentimentTitle']].values

# Same layout for the headlines.
X_train_headline = train['Headline'].values
y_train_headline = train[['SentimentHeadline']].values

In [None]:
# Raw title and headline text of the test set as NumPy arrays.
X_test_title, X_test_headline = (test[col].values for col in ('Title', 'Headline'))

In [None]:
# Small working frames that pair each text column with its target, so the
# cleaned text can be stored alongside the label it belongs to.
# The targets are single-column 2-D arrays; ravel() flattens them to one value per row.
title_df = pd.DataFrame({
    'X_train_title': X_train_title,
    'y_train_title': y_train_title.ravel(),
})

headline_df = pd.DataFrame({
    'X_train_headline': X_train_headline,
    'y_train_headline': y_train_headline.ravel(),
})

# The test set has no targets, only the two text columns.
test_df = pd.DataFrame({
    'X_test_title': X_test_title,
    'X_test_headline': X_test_headline,
})

In [None]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(texts, stop_words=None):
    """Normalise one piece of text before tokenisation.

    Steps, in order: lowercase the text, replace every run of non-ASCII
    characters with a single space, split on whitespace, drop words that
    start with '@', drop stopwords, and re-join the rest with single spaces.

    Args:
        texts: The text to clean (a single string).
        stop_words: Optional collection of words to remove. When omitted,
            NLTK's English stopword list is used; it is loaded once and
            reused, instead of being rebuilt for every word of every text.

    Returns:
        The cleaned text as one space-separated string ('' if nothing is left).
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    texts = texts.lower()
    texts = re.sub(r'[^\x00-\x7F]+',' ', texts)
    # split() never yields empty strings, so word[0] is always safe here.
    kept = [word for word in texts.split()
            if word[0] != '@' and word not in stop_words]
    return " ".join(kept)

In [None]:
# Replace the raw training titles with their cleaned form and preview the result.
cleaned_titles = title_df['X_train_title'].apply(preprocess_text)
title_df['X_train_title'] = cleaned_titles
display(title_df.head())

Unnamed: 0,X_train_title,y_train_title
0,obama lays wreath arlington national cemetery,0.0
1,look health chinese economy,0.208333
2,nouriel roubini: global economy back 2008,-0.42521
3,finland gdp expands q4,0.0
4,"tourism, govt spending buoys thai economy january",0.0


In [None]:
# Replace the raw training headlines with their cleaned form and preview the result.
cleaned_headlines = headline_df['X_train_headline'].apply(preprocess_text)
headline_df['X_train_headline'] = cleaned_headlines
display(headline_df.head())

Unnamed: 0,X_train_headline,y_train_headline
0,obama lays wreath arlington national cemetery....,-0.0533
1,"tim haywood, investment director business-unit...",-0.156386
2,"nouriel roubini, nyu professor chairman roubin...",0.139754
3,finland's economy expanded marginally three mo...,0.026064
4,tourism public spending continued boost econom...,0.141084


In [None]:
# Apply the same cleaning to both test text columns, then preview the result.
for text_col in ('X_test_title', 'X_test_headline'):
    test_df[text_col] = test_df[text_col].apply(preprocess_text)
display(test_df.head())

Unnamed: 0,X_test_title,X_test_headline
0,sliding economy: fg fights back n3trn tsa funds,2016 budget passed national assembly n3trillio...
1,microsoft shows hololens bring distant family ...,recent microsoft research video shows $3000 au...
2,"microsoft twitter robot praises hitler, trump ...","* microsoft teamed bing create taytweets, acco..."
3,flood central bank moves can't get world econo...,central bankers managed steer world economy cl...
4,usd/jpy: bears lining mixed u.s. economy outlook,"however, streak seven-day gains might end mark..."


In [None]:
# Build the title vocabulary and turn every title into a fixed-length id sequence.
tok_title = Tokenizer()
tok_title.fit_on_texts(title_df.X_train_title)
# +1 because Keras word indices start at 1; index 0 is left for padding.
vocab_size_title = len(tok_title.word_index) + 1
encoded_title = tok_title.texts_to_sequences(title_df.X_train_title)

# Measure the longest *encoded* title. The Tokenizer's default filters strip
# punctuation before splitting (e.g. "u.s." becomes two tokens), so counting
# whitespace-separated words can under-estimate the length and make
# pad_sequences silently truncate the longest titles.
max_len_title = max(len(seq) for seq in encoded_title)
padded_title = pad_sequences(encoded_title, maxlen=max_len_title, padding='post')

In [None]:
# Seed matrix for the title embedding layer: row i holds the 50-d GloVe vector
# of the word whose tokenizer index is i. Words absent from GloVe keep a zero row.
title_embedding_matrix = np.zeros((vocab_size_title, 50))
for token, row in tok_title.word_index.items():
    if token in embeddings_index:
        title_embedding_matrix[row] = embeddings_index[token]

In [None]:
# Build the headline vocabulary and turn every headline into a fixed-length id sequence.
tok_headline = Tokenizer()
tok_headline.fit_on_texts(headline_df.X_train_headline)
# +1 because Keras word indices start at 1; index 0 is left for padding.
vocab_size_headline = len(tok_headline.word_index) + 1
encoded_headline = tok_headline.texts_to_sequences(headline_df.X_train_headline)

# Measure the longest *encoded* headline. The Tokenizer's default filters strip
# punctuation before splitting (e.g. "seven-day" becomes two tokens), so counting
# whitespace-separated words can under-estimate the length and make
# pad_sequences silently truncate the longest headlines.
max_len_headline = max(len(seq) for seq in encoded_headline)
padded_headline = pad_sequences(encoded_headline, maxlen=max_len_headline, padding='post')

In [None]:
# Seed matrix for the headline embedding layer: row i holds the 50-d GloVe vector
# of the word whose tokenizer index is i. Words absent from GloVe keep a zero row.
headline_embedding_matrix = np.zeros((vocab_size_headline, 50))
for token, row in tok_headline.word_index.items():
    if token in embeddings_index:
        headline_embedding_matrix[row] = embeddings_index[token]

In [None]:
# Encode the test titles with the tokenizer fitted on the TRAINING titles and
# pad them to the training length. A tokenizer fitted on the test set assigns
# different integer ids to the same words, so the ids would no longer point at
# the embedding rows title_model was trained with, and a different pad length
# would not match the model's input_length.
test_tok_title = tok_title                # alias kept so the old names still exist
test_max_len_title = max_len_title
test_vocab_size_title = vocab_size_title
test_encoded_title = test_tok_title.texts_to_sequences(test_df.X_test_title)
test_padded_title = pad_sequences(test_encoded_title, maxlen=test_max_len_title, padding='post')

In [None]:
# Encode the test headlines with the tokenizer fitted on the TRAINING headlines
# and pad them to the training length. A tokenizer fitted on the test set
# assigns different integer ids to the same words, so the ids would no longer
# point at the embedding rows headline_model was trained with, and a different
# pad length would not match the model's input_length.
test_tok_headline = tok_headline          # alias kept so the old names still exist
test_max_len_headline = max_len_headline
test_vocab_size_headline = vocab_size_headline
test_encoded_headline = test_tok_headline.texts_to_sequences(test_df.X_test_headline)
test_padded_headline = pad_sequences(test_encoded_headline, maxlen=test_max_len_headline, padding='post')

In [None]:
# Hold out 10% of each padded dataset for validation. A fixed random_state makes
# the shuffle, and therefore the validation score, reproducible across
# Restart & Run All.
x_train_title, x_valid_title, Y_train_title, y_valid_title = train_test_split(
    padded_title, y_train_title, shuffle=True, test_size=0.1, random_state=42)
x_train_headline, x_valid_headline, Y_train_headline, y_valid_headline = train_test_split(
    padded_headline, y_train_headline, shuffle=True, test_size=0.1, random_state=42)

In [None]:
import math
from math import exp
from keras import backend as K

In [None]:
def mod_tanh(x):
    """Output activation: tanh applied to the input scaled by 0.6.

    Used as the activation of the final Dense(1) layer of both models.
    """
    scaled = 0.6 * x
    return K.tanh(scaled)

### Title model

In [None]:
# Title regressor: GloVe-initialised embedding -> three stacked bidirectional
# LSTMs (each followed by dropout and batch normalisation) -> two dense layers
# -> a single output through mod_tanh.
title_model = Sequential([
    Embedding(vocab_size_title, 50, input_length=max_len_title,
              weights=[title_embedding_matrix], trainable=True),
    # The first two recurrent layers return full sequences to feed the next LSTM.
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    # The last recurrent layer returns only its final output.
    Bidirectional(LSTM(20)),
    Dropout(0.3),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation=mod_tanh),
])
title_model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])

### Headline model

In [None]:
# Headline regressor: same architecture as the title model, built on the
# headline vocabulary, sequence length and embedding matrix.
headline_model = Sequential([
    Embedding(vocab_size_headline, 50, input_length=max_len_headline,
              weights=[headline_embedding_matrix], trainable=True),
    # The first two recurrent layers return full sequences to feed the next LSTM.
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    # The last recurrent layer returns only its final output.
    Bidirectional(LSTM(20)),
    Dropout(0.3),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation=mod_tanh),
])
headline_model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])

In [None]:
import tensorflow

In [None]:
# Train the title model for 10 epochs on the training split, requesting GPU 0.
# NOTE(review): the validation split is not passed to fit(); it is only scored afterwards.
with tensorflow.device('/device:GPU:0'):
    title_model.fit(x_train_title, Y_train_title, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Train the headline model for 10 epochs on the training split, requesting GPU 0.
# NOTE(review): the validation split is not passed to fit(); it is only scored afterwards.
with tensorflow.device('/device:GPU:0'):
    headline_model.fit(x_train_headline, Y_train_headline, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Predict title sentiment for the held-out validation titles.
title_valid_pred = title_model.predict(x_valid_title)

In [None]:
# Predict headline sentiment for the held-out validation headlines.
headline_valid_pred = headline_model.predict(x_valid_headline)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Mean absolute error of the title model on the validation split.
mae_title=mean_absolute_error(y_valid_title,title_valid_pred)

In [None]:
# Mean absolute error of the headline model on the validation split.
mae_headline=mean_absolute_error(y_valid_headline,headline_valid_pred)

In [None]:
# Combined validation score: 1 minus the weighted MAE
# (title error weighted 0.4, headline error weighted 0.6). Higher is better.
weighted_mae = (0.4 * mae_title) + (0.6 * mae_headline)
score = 1 - weighted_mae

In [None]:
# Display the combined validation score.
score

0.9320236916220008

In [None]:
# Predict title sentiment for the padded test titles.
pred_title=title_model.predict(test_padded_title)

In [None]:
# Predict headline sentiment for the padded test headlines.
pred_headline=headline_model.predict(test_padded_headline)



In [None]:
# Start an empty frame that the next cell fills with the submission columns.
submission=pd.DataFrame()

In [None]:
# Assemble the submission: article ids from the test set plus both models' predictions.
submission['IDLink']=test['IDLink'].to_list()
submission['SentimentTitle']=pred_title
submission['SentimentHeadline']=pred_headline

In [None]:
# Preview the first rows of the submission.
submission.head()

Unnamed: 0,IDLink,SentimentTitle,SentimentHeadline
0,tFrqIR6Chj,0.172802,-0.000203
1,DVAaGErjlF,-0.094027,0.253928
2,OT9UIZm5M2,-0.001465,-0.084972
3,lflGp3q2Fj,-0.140512,0.170248
4,zDYG0SoovZ,-0.007781,0.01487


In [None]:
# Write the submission file. index=False keeps the DataFrame's row counter out
# of the CSV, so the file contains only IDLink and the two sentiment columns
# instead of an extra unnamed leading column.
submission.to_csv('./data/submission2.csv', index=False)