In [1]:
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

## Preprocessing
#### Data Cleaning and Joining Tables

In [2]:
# Loading in the data
# Both CSVs are read from paths relative to the notebook's working directory.
article_summary = pd.read_csv('../data/clean_data/article_summary.csv')
image_df = pd.read_csv('../data/clean_data/image_summary.csv')

In [3]:
# Article dataframe
# Only the article id and the headline are kept; `id` becomes `article_id`.
small_df = article_summary[['id', 'headline']].rename(columns={'id': 'article_id'})
print(small_df.shape)
small_df.head()

(22602, 2)


Unnamed: 0,article_id,headline
0,964285bee74e430cb22441e59a968e76,Sheriff: NY man calls 911 to say he's violatin...
1,5485474faec244b0881838c139b4ef10,Champion Patriots open vs. Steelers; 5 interna...
2,6aff14aef2ff4a10a4f27f4ca879ad9e,"Tick tock, tick tock: Tokyo Olympics clock hit..."
3,3a29e90a364a4ce7bb294286a8647535,"American Airlines first sponsor for Rams, Char..."
4,bbabfcdbb53b4804adaffb2933bfc207,Off to a new life: 3 Albanian lions sent to Du...


In [4]:
# Image dataframe
# Keep only the image id and its article id, renamed to `img_id` / `article_id`.
# `image_df` is overwritten by this cell, so on a second execution the raw
# column names are already gone; the guard skips the select/rename step in that
# case instead of raising KeyError, which makes the cell safe to re-run.
raw_to_clean = {'id': 'img_id', 'article_idx': 'article_id'}
if 'article_idx' in image_df.columns:
    image_df = image_df[list(raw_to_clean)].rename(columns=raw_to_clean)
print(image_df.shape)

# Getting rid of duplicates: one row per img_id, keeping the max article_id
# of each group (rows come back sorted by img_id with a fresh 0..n-1 index).
image_df = image_df.groupby('img_id', as_index=False).agg('max')
print(image_df.shape)
image_df.head()

(78521, 2)
(78521, 2)


Unnamed: 0,img_id,article_id
0,0002c8b6322446a8a9b8f8abaccb4430,6932c6d6086543c4919b85286687e7f1
1,00036662b297400ca0c0819173d61efd,6a3784541f984889ac98770271a62c50
2,0004a620b93b4d55addfd4eba8ad79df,80bfa84a31cb449cb5a60e1a24d7f8a7
3,0006323cab994ddd9c0824d14c2146fd,bcb294c8c35a464798207c7e96d8d74e
4,00081aa6f45c467c8b44f7806412dbe5,63ad4a991b8f4a63ac05ffd8ccf1f97d


In [5]:
# Merging article and image databases
# Inner join on article_id: each image row picks up the headline of its article;
# images whose article_id has no match in small_df are dropped.
full_df = pd.merge(image_df, small_df, how='inner', on='article_id')
print(full_df.shape)
full_df.head(5)

(78521, 3)


Unnamed: 0,img_id,article_id,headline
0,0002c8b6322446a8a9b8f8abaccb4430,6932c6d6086543c4919b85286687e7f1,The Latest: Avenatti expects to be 'fully exon...
1,ec054dc8728b4c0bb5207519534a6c1d,6932c6d6086543c4919b85286687e7f1,The Latest: Avenatti expects to be 'fully exon...
2,f5580c3cb4c84dc4b89a981f00f90309,6932c6d6086543c4919b85286687e7f1,The Latest: Avenatti expects to be 'fully exon...
3,ffab88527d4944628bfc061d6ac6f6ae,6932c6d6086543c4919b85286687e7f1,The Latest: Avenatti expects to be 'fully exon...
4,00036662b297400ca0c0819173d61efd,6a3784541f984889ac98770271a62c50,Aging voters in Greece keep traditional campai...


In [6]:
# Replacing duplicate image ids
# Each line of the file is a comma-separated group of ids. Every id after the
# first one on a line is mapped to that first id.
with open('../data/image_duplicates.txt', 'r') as f:
    replace_list = f.readlines()

replace_dict = {}
for line in replace_list:
    canonical, *aliases = line.strip().split(',')
    for alias in aliases:
        replace_dict[alias] = canonical

# Ids without an entry in the mapping are left untouched.
full_df['img_id'] = [replace_dict.get(img, img) for img in full_df['img_id']]
print(full_df.shape)
full_df.head()

(78521, 3)


Unnamed: 0,img_id,article_id,headline
0,0002c8b6322446a8a9b8f8abaccb4430,6932c6d6086543c4919b85286687e7f1,The Latest: Avenatti expects to be 'fully exon...
1,ec054dc8728b4c0bb5207519534a6c1d,6932c6d6086543c4919b85286687e7f1,The Latest: Avenatti expects to be 'fully exon...
2,f5580c3cb4c84dc4b89a981f00f90309,6932c6d6086543c4919b85286687e7f1,The Latest: Avenatti expects to be 'fully exon...
3,ffab88527d4944628bfc061d6ac6f6ae,6932c6d6086543c4919b85286687e7f1,The Latest: Avenatti expects to be 'fully exon...
4,00036662b297400ca0c0819173d61efd,6a3784541f984889ac98770271a62c50,Aging voters in Greece keep traditional campai...


In [7]:
# Number of distinct image ids left after the duplicate replacement
len(pd.unique(full_df['img_id']))

78021

In [8]:
# Number of image ids that have a replacement entry in the mapping
len(replace_dict)

501

#### Label Encoding of Image and Article IDs

In [None]:
# Label encoding
# The string ids are replaced by integer codes; both fitted encoders are kept
# so the codes can be mapped back to the original ids later.
le_img = LabelEncoder()
full_df['img_id'] = le_img.fit_transform(full_df['img_id'])

le_art = LabelEncoder()
full_df['article_id'] = le_art.fit_transform(full_df['article_id'])

print(full_df.shape)
full_df.head()

#### Train-Test Split

In [None]:
# Split by article rather than by row, so that all images of one article end
# up on the same side. article_id is label-encoded, so a threshold on the code
# at 80% of the number of distinct articles gives roughly an 80/20 split.
n_articles = full_df['article_id'].nunique()
split_at = int(0.8 * n_articles)

train_df = full_df[full_df['article_id'] < split_at]
# `>=`, not `>`: with a strict comparison on both sides, every row whose
# article_id equals the cutoff was silently dropped from both sets.
test_df = full_df[full_df['article_id'] >= split_at]
print(train_df.shape, test_df.shape)

#### Headline Representation using TF-IDF and Truncated SVD (for dimensionality reduction)

In [None]:
# TF-IDF vectorizer
# Fitted on the training headlines only; the test headlines are transformed
# with the vocabulary and IDF weights learned from the training set.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')
# NOTE: fit() returns the vectorizer itself, so `features` is just another
# reference to `tfidf`; the name is kept in case later cells rely on it.
features = tfidf.fit(train_df['headline'])
transform_train = tfidf.transform(train_df['headline'])
transform_test = tfidf.transform(test_df['headline'])

# Truncated SVD
# random_state pins the randomized solver, so the components (and every result
# derived from them) are reproducible across kernel restarts.
tsvd = TruncatedSVD(n_components=1000, random_state=42).fit(transform_train)
tsvd_transform_train = pd.DataFrame(tsvd.transform(transform_train))
tsvd_transform_test = pd.DataFrame(tsvd.transform(transform_test))

# Printing Shapes
print(tsvd_transform_train.shape, tsvd_transform_test.shape)

In [None]:
# Row counts of each split and of its SVD matrix must agree before they are
# concatenated side by side.
for frame, reduced in ((train_df, tsvd_transform_train), (test_df, tsvd_transform_test)):
    print(frame.shape, reduced.shape)

In [None]:
# Attaching the SVD components to each split, column-wise.
# Both sides get a fresh 0..n-1 index first so that rows are aligned by position.
# The headline text and the article id are then dropped.
unused_cols = ['headline', 'article_id']

train_df_svd = pd.concat(
    [train_df.reset_index(drop=True), tsvd_transform_train.reset_index(drop=True)],
    axis=1,
).drop(columns=unused_cols)

test_df_svd = pd.concat(
    [test_df.reset_index(drop=True), tsvd_transform_test.reset_index(drop=True)],
    axis=1,
).drop(columns=unused_cols)

# Sanity checks: worst-case null count per column, then the final shapes
print(train_df_svd.isnull().sum().max(), test_df_svd.isnull().sum().max())
print(train_df_svd.shape, test_df_svd.shape)
train_df_svd.head()

In [None]:
# train and test x and y
# Feature columns are everything except the target; the list is derived from
# the training frame and reused for the test frame so both have the same columns.
feature_cols = train_df_svd.columns.difference(['img_id'])

x_train, y_train = train_df_svd[feature_cols], train_df_svd['img_id']
x_test, y_test = test_df_svd[feature_cols], test_df_svd['img_id']

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

## Modeling

#### Logistic Regression

In [None]:
# Multinomial logistic regression over the SVD features, target = encoded img_id.
log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial')
clf = log_reg.fit(x_train, y_train)  # fit() returns the fitted estimator

---