In [1]:
import gzip
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
from sklearn.model_selection import train_test_split as tts
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
mwe = MWETokenizer()
lemma = WordNetLemmatizer()
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from tensorflow.python.keras.layers import  Dropout, Dense
from tensorflow.python.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
clickdata=gzip.open("clickbait_data.gz", "rb")
nonclickdata=gzip.open("non_clickbait_data.gz", "rb")
click_df=pd.read_csv(clickdata, delimiter="\n", header=None)
nonclick_df=pd.read_csv(nonclickdata, delimiter="\n", header=None)

In [3]:
print(click_df.shape)
click_df.head()

(15999, 1)


Unnamed: 0,0
0,Should I Get Bings
1,Which TV Female Friend Group Do You Belong In
2,"The New ""Star Wars: The Force Awakens"" Trailer..."
3,"This Vine Of New York On ""Celebrity Big Brothe..."
4,A Couple Did A Stunning Photo Shoot With Their...


In [4]:
print(nonclick_df.shape)
nonclick_df.head()

(16001, 1)


Unnamed: 0,0
0,Bill Changing Credit Card Rules Is Sent to Oba...
1,"In Hollywood, the Easy-Money Generation Toughe..."
2,1700 runners still unaccounted for in UK's Lak...
3,Yankees Pitchers Trade Fielding Drills for Put...
4,Large earthquake rattles Indonesia; Seventh in...


In [5]:
click_df["Clickbait"] = 1
nonclick_df["Clickbait"] = 0

In [6]:
df = click_df.append(nonclick_df, ignore_index=True)
print(df.shape)

(32000, 2)


In [7]:
df.columns = ['news','Clickbait']

In [8]:
def remove_stopwords(text):
  token = text.split()
  return ' '.join([w for w in token if not w in english_stopwords])

In [9]:
def tokenize(text):
  return mwe.tokenize(text)

In [10]:
def lemmatize(text):
  return ''.join([lemma.lemmatize(word,'v') for word in mwe.tokenize(text)])

In [11]:
def shortwords(text):
  text = ' '.join([w for w in text.split() if len(w)>2])
  return text

In [12]:
def remove_punctuation(text):
  text = [i for i in text if i.isalpha() or i.isspace()]
  return ''.join(text)

In [13]:
def preprocess(s):
  s = s.lower()
  text = remove_stopwords(s)
  text = shortwords(text)
  text = remove_punctuation(text)
  text = tokenize(text)
  text = lemmatize(text)
  
  return text

In [14]:
df.news = df.news.apply(preprocess)

In [15]:
df.head()

Unnamed: 0,news,Clickbait
0,get bings,1
1,female friend group belong,1
2,new star wars force awakens trailer give chills,1
3,vine new york celebrity big brother fucking pe...,1
4,couple stunning photo shoot baby learning inop...,1


In [16]:
vectorizer = TfidfVectorizer()
X = df.news
y = df.Clickbait

In [17]:
X = vectorizer.fit_transform(X)

In [18]:
features = vectorizer.get_feature_names()

In [19]:
dense = X.todense().tolist()

In [20]:
new_df = pd.DataFrame(dense, columns=features)

In [21]:
final_df = pd.concat([new_df,y],axis=1,sort=False)
final_df.head()

Unnamed: 0,aaa,aaevpc,aap,aaron,aarons,ab,abandon,abandoned,abandoning,abandons,...,złoty,ºf,ángel,íngrid,íslands,îledefrance,ürümqi,śrī,šibenik,Clickbait
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [22]:
X1 = final_df.drop(columns="Clickbait")
y1 = final_df["Clickbait"]

In [23]:
del new_df,dense,X,y, final_df

In [24]:
X1train,X1test,y1train,y1test = tts(X1,y1,test_size = 0.3,stratify=y1, random_state=54)

In [25]:
X1train = X1train.to_numpy()
X1test = X1test.to_numpy()
y1train = y1train.to_numpy()
y1test = y1test.to_numpy()

In [26]:
print(X1train.shape)

(22400, 23840)


In [27]:
model = Sequential()
model.add(Dense(25,input_shape=(X1train.shape[1],),activation='relu'))
model.add(Dropout(0.6))
model.add(Dense(10,activation='relu'))
model.add(Dropout(0.6))
model.add(Dense(10,activation='relu'))
model.add(Dropout(0.6))
model.add(Dense(10,activation='relu'))
model.add(Dropout(0.6))
model.add(Dense(10,activation='relu'))
model.add(Dropout(0.6))
model.add(Dense(2, activation='sigmoid'))
np.random.seed(5)

In [28]:
opt=Adam(lr=0.001)
model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])


In [29]:
model.fit(X1train, y1train,
          validation_split=0.3,
          epochs = 10,
          batch_size = 120)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd2235a9bb0>

In [30]:
score=model.evaluate(X1test, y1test)



In [31]:
pred=model.predict_classes(X1test)
matr=confusion_matrix(y1test, pred)
matr

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


array([[4359,  441],
       [ 155, 4645]])

In [32]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 25)                596025    
_________________________________________________________________
dropout (Dropout)            (None, 25)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                260       
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                110       
_________________________________________________________________
dropout_2 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1

In [33]:
model.save("best_model.h5")

In [35]:
model.save_weights('best_weights')