# Final Project
## 1. Read the datasets

In [1]:
import pandas as pd
import numpy as np
np.random.seed(30224)

In [2]:
# Read train1.csv and train2.csv
train1 = pd.read_csv("./../data/train1.csv", header=0)
train2 = pd.read_csv("./../data/train2.csv", header=0)

In [3]:
train1.head(5)

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [4]:
train2.head(5)

Unnamed: 0,label,title
0,news,China and Economic Reform: Xi Jinping’s Track ...
1,news,Trade to Be a Big Topic in Theresa May’s U.S. ...
2,clickbait,"The Top Beaches In The World, According To Nat..."
3,clickbait,Sheriff’s Report Provides New Details on Tamir...
4,news,Surgeon claiming he will transplant volunteer'...


In [5]:
# Check for missing data
train1[train1.isna().any(axis=1)]

Unnamed: 0,headline,clickbait


In [6]:
train2[train2.isna().any(axis=1)]

Unnamed: 0,label,title


In [7]:
# Combine datasets
train2["clickbait"] = np.where(train2["label"] == "clickbait", 1, 0)
train2["headline"] = train2["title"]

df = pd.concat([train1, train2], join="inner")

## 2. Exploratory data analysis

In [8]:
# Show number of clickbait vs non-clickbait titles in the dataset
df["clickbait"].value_counts()

0    32739
1    20290
Name: clickbait, dtype: int64

In [9]:
# Tokenize of news headlines
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
nltk.download('punkt')
nltk.download('stopwords')

stop_tokens = set(stopwords.words("english")).union(set(string.punctuation))

def tokenize(sentence: str):
    tokens = word_tokenize(sentence)
    tokens = [token.lower() for token in tokens if token not in stop_tokens]
    return tokens

df["headline_tokens"] = df["headline"].apply(tokenize)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Compare title, word lengths of clickbait vs non-clickbait headlines
df

Unnamed: 0,headline,clickbait,headline_tokens
0,Should I Get Bings,1,"[should, i, get, bings]"
1,Which TV Female Friend Group Do You Belong In,1,"[which, tv, female, friend, group, do, you, be..."
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1,"[the, new, ``, star, wars, the, force, awakens..."
3,"This Vine Of New York On ""Celebrity Big Brothe...",1,"[this, vine, of, new, york, on, ``, celebrity,..."
4,A Couple Did A Stunning Photo Shoot With Their...,1,"[a, couple, did, a, stunning, photo, shoot, wi..."
...,...,...,...
21024,The New Playboy and the End of Adulthood,0,"[the, new, playboy, end, adulthood]"
21025,Use of weaponized drones by ISIS spurs terrori...,0,"[use, weaponized, drones, isis, spurs, terrori..."
21026,Experts: China might be building database of f...,0,"[experts, china, might, building, database, fe..."
21027,How Trump Can Fix His Troubled White House,0,"[how, trump, can, fix, his, troubled, white, h..."


In [11]:
# Compare word cloud of clickbait vs non-clickbait headlines