In [1]:
# reading cnn_samples and federal_samples

import pandas as pd
import ast
import numpy as np

# Load the two sample sets. The federal file names its column "embeddings",
# while the CNN file already uses "embedding", so align the names first.
df1 = pd.read_csv("cnn_samples.csv")
df2 = pd.read_csv("federal_samples.csv").rename(columns={"embeddings": "embedding"})

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# both source indices are kept, so labels repeat and any label-based lookup
# (df["embedding"][i], df.loc[i]) would return several rows.
df = pd.concat([df1, df2], ignore_index=True)

# fix embedding column
# read_csv loads each embedding as the text of a Python literal; parse it back
# into a Python object. Row order is df1's rows followed by df2's rows.
embed_col = [ast.literal_eval(raw_embedding) for raw_embedding in df["embedding"]]
df["embedding"] = embed_col

In [5]:
# clustering data

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing

# inertias = []
# max_clusters = 30

# for i in range(1, max_clusters + 1):
#     kmeans = KMeans(n_clusters=i, random_state=0).fit(preprocessing.normalize(df["embedding"].tolist()))
#     inertias.append(kmeans.inertia_)

In [6]:
# graphing inertias

# plt.plot(range(1, max_clusters + 1),inertias, '-bo')
# plt.xlabel("k values")
# plt.ylabel("inertia")
# plt.show()

We chose k = 12 using the elbow method: the inertia drops steeply up to about k = 12 and decreases only gradually after that.

In [7]:
# add labels to dataframe

# Normalize the embeddings, then fit 12 clusters on them with a fixed seed
# so the cluster ids are reproducible.
unit_embeddings = preprocessing.normalize(df["embedding"].tolist())
kmeans = KMeans(n_clusters=12, random_state=0).fit(unit_embeddings)

# First run: put "labels" at column position 0 (later cells read another column
# by position, so the placement matters). On a re-run the column already
# exists, so overwrite it in place instead of inserting a duplicate.
if "labels" not in df.columns:
    df.insert(0, "labels", kmeans.labels_)
else:
    df["labels"] = kmeans.labels_

In [8]:
# make predictions about categories

# Topic name for each k-means cluster id. `categories` is indexed by the
# integer label returned by kmeans.predict, so the order must match the ids.
cluster_topics = {
    0: "Technology",
    1: "Money",
    2: "Law",
    3: "Environment",
    4: "Other",
    5: "Health",
    6: "Employment",
    7: "World News",
    8: "Politics",
    9: "Entertainment",
    10: "Violence",
    11: "Other",
}
categories = [cluster_topics[cluster_id] for cluster_id in sorted(cluster_topics)]

In [9]:
# import challenge articles

# Read the challenge set and use the same "embedding" column name as the samples.
dfc = pd.read_csv("challenge.csv")
dfc.rename(columns={"embeddings": "embedding"}, inplace=True)

# Each embedding is read from the CSV as the text of a Python literal; parse
# every row and store the parsed values back in the column.
embed_col = [ast.literal_eval(raw_embedding) for raw_embedding in dfc["embedding"]]
dfc["embedding"] = embed_col

In [10]:
# predict categories for 5 challenge articles

# Normalize the challenge embeddings the same way as the training data, then
# print the topic name of the cluster each article is assigned to.
challenge_vectors = preprocessing.normalize(dfc["embedding"].tolist())
challenge_clusters = kmeans.predict(challenge_vectors)
for cluster_id in challenge_clusters:
    print(categories[cluster_id])

Entertainment
Entertainment
Health
World News
Environment


In [11]:
# print top 3 articles most similar to challenge articles

from scipy import spatial

def cos_similarity(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    Computed as one minus SciPy's cosine distance between the two vectors.
    """
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

# Iterate over the challenge embeddings directly instead of hard-coding the
# number of challenge articles and indexing the column by label.
for challenge_embedding in dfc["embedding"]:
    similarities = [cos_similarity(embedding, challenge_embedding) for embedding in df["embedding"]]

    # Rank row positions, not similarity values. Looking a value up again with
    # list.index() always returns its first occurrence, so tied scores (e.g. a
    # duplicated article) would print the same row more than once. sorted() is
    # stable, so ties keep their original row order.
    ranked_rows = sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)
    top_rows = ranked_rows[:3]
    top_3 = [similarities[row] for row in top_rows]

    print(top_3)
    for row in top_rows:
        # Column position 3 is the one the original cell printed; it relies on
        # "labels" having been inserted at position 0.
        # NOTE(review): presumably the article text column; confirm against the CSV header.
        print(df.iloc[row, 3])
    print()

[0.8070927869054605, 0.8047635531994215, 0.803327863916594]
LONDON, England (CNN) -- The Screening Room went to the Theatre Royal in Drury Lane in the heart of London's West End, where a spectacular musical version of "The Lord of The Rings" is enjoying a successful run, to meet Indian composer A. R. Rahman, whose blend of Asian culture with rock and Western classical styles has revolutionized the Indian film industry. And now Rahman is about to make his mark in Hollywood. A. R. Rahman, interviewed by CNN's Screening Room . Virtually unknown in Europe and the U.S., Rahman has sold 200 million albums worldwide -- more than the Beatles -- and is worshipped throughout much of Asia, where he's known as the Mozart of Madras. Now he has added a Hollywood film score to his vast repertoire of movie music. Rahman explained to CNN what he thinks makes great movie music. "A great soundtrack is like 'Laura"s theme', the 'Love Story' theme, 'Chariots of Fire' and all those sorts of things, where it

In [193]:
# import mystery challenge

# Read the mystery article and use the same "embedding" column name as the other sets.
mystery_columns = {"embeddings": "embedding"}
dfj = pd.read_json("mystery.json")
dfj.rename(columns=mystery_columns, inplace=True)

In [194]:
# predict category of mystery article
# The whole "embedding" column is passed as one sample (a one-row batch), so
# predict returns a single cluster id.
mystery_unit = preprocessing.normalize([dfj["embedding"]])
mystery_cluster = kmeans.predict(mystery_unit)[0]
print(categories[mystery_cluster])

World News


In [195]:
# print the sample article most similar to the mystery article

# Score every sample article against the mystery embedding.
mystery_vector = dfj["embedding"]
similarities = [cos_similarity(article_vector, mystery_vector) for article_vector in df["embedding"]]

# Row position of the highest score (first occurrence on ties); column
# position 3 is the column this cell prints.
closest_row = similarities.index(max(similarities))
print(df.iloc[closest_row, 3])

(CNN) -- At least 37 people, including 26 civilians and some Syrian soldiers, were killed in a car bombing Sunday at a checkpoint on the outskirts of Hama, the Syrian Observatory for Human Rights and state media said. SANA, a Syrian-run news agency, said the death toll could rise because at least 10 more people were seriously wounded. A suicide bomber detonated more than a ton of explosives in a truck on a busy street near a farm machinery company, SANA said. More than 20 vehicles and some homes and stores were damaged, it reported. Meanwhile, clashes between the Syrian military and rebel brigades continue to rage on the eastern outskirts of the government-held city, the London-based Syrian opposition group said. Lebanese hostages released after 17 months . Another opposition group, the Local Coordination Committees in Syria, reported at least 15 killings in other parts of the country. Eight people were killed in Damascus and its suburbs, six in Aleppo province and one in Homs province

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8acd1949-f0b2-4f72-a9f9-393c67b9ad4b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>