In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# data is the 1% sample from: https://github.com/google-research-datasets/wit/blob/main/DATA.md
df = pd.read_csv("data/wit_v1.train.all-1percent_sample.tsv", sep='\t')
df.head()

In [None]:
en_df = df[df['page_url'].str.contains('https://en.wikipedia.org')]
en_df.head()

In [None]:
new_df = en_df.loc[:, ['page_title', 'image_url', 'context_page_description', 'context_section_description']]
new_df.dropna(inplace=True)
new_df.head()

In [24]:
new_df['content'] = new_df.apply(lambda row: f"{row['context_page_description']}. {row['context_section_description']}", axis=1)
new_df.drop(['context_page_description', 'context_section_description'], axis=1, inplace=True)
new_df.rename(columns={'image_url': 'pic'}, inplace=True)
new_df.rename(columns={'page_title': 'title'}, inplace=True)
new_df.head()

Unnamed: 0,title,pic,content
5,"Chinatown, Sydney",http://upload.wikimedia.org/wikipedia/commons/...,Chinatown is an urban enclave situated in the ...
6,Jayson Musson,https://upload.wikimedia.org/wikipedia/commons...,Jayson Scott Musson is an artist who lives and...
8,Euodynerus megaera,https://upload.wikimedia.org/wikipedia/commons...,Euodynerus megaera is a species of stinging wa...
9,Simon W. Rosendale,https://upload.wikimedia.org/wikipedia/commons...,Simon Wolfe Rosendale was an American lawyer a...
12,1998 Atlantic hurricane season,https://upload.wikimedia.org/wikipedia/commons...,The 1998 Atlantic hurricane season was one of ...


In [None]:
from sklearn.externals import joblib
lda = joblib.load('models/my_book_post_lda.pkl')

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

documents = new_df.apply(lambda row: f"{row['title']} {row['content']}", axis=1)
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
lda = LatentDirichletAllocation(n_components=30)
lda.fit(X)

In [32]:
import joblib
joblib.dump(lda, 'models/my_book_post_lda.pkl')

['models/my_book_post_lda.pkl']

In [33]:
# Get the feature names (words) from the vectorizer
feature_names = vectorizer.get_feature_names_out()
# Print the top words for each topic
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

Topic 0:
school university station railway college public line city services students
Topic 1:
air force united army states squadron aircraft military command unit
Topic 2:
road bridge route highway line railway traffic north south street
Topic 3:
list tropical storm australia heritage season hurricane sites south solar
Topic 4:
election new united won held time states team american year
Topic 5:
museum park london station garden new baseball art railway opened
Topic 6:
species white type known black common red plant long small
Topic 7:
war ship aircraft navy ships world class british built battle
Topic 8:
species family genus airport known described list marine lc endemic
Topic 9:
people india indian islands south island world region population africa
Topic 10:
wales south new sydney australia van amsterdam family australian dutch
Topic 11:
church st catholic saint temple cathedral century bishop roman christian
Topic 12:
town village municipality population north district area south 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

documents = new_df.apply(lambda row: f"{row['title']} {row['content']}", axis=1)

# Create a CountVectorizer object to vectorize the text data
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(documents)
# Vectorize the text data
X = vectorizer.transform(documents)

# Transform the vectorized text data into the topic space
topic_probs = lda.transform(X)

# Assign the most probable topic to each row
new_df['topic'] = topic_probs.argmax(axis=1)

# Define a mapping from topic indices to topic labels
topic_labels = {
 0: "Public Transportation and Education",
 1: "Military",
 2: "Roads and Highways",
 3: "Natural Disasters and Heritage Sites",
 4: "Elections and Sports",
 5: "Museums, Parks, and Arts",
 6: "Species and Plants",
 7: "Naval Warfare",
 8: "Marine Species and Airports",
 9: "Population and Geography of India and Africa",
 10: "Australia, New South Wales, and Dutch Heritage",
 11: "Religious Buildings and Figures",
 12: "Municipalities and Population",
 13: "Politicians and Public Servants",
 14: "Film and Television Industry",
 15: "War and Government",
 16: "Russian, Ukrainian, and Armenian Cuisine",
 17: "Automobile Industry",
 18: "Art History and Architecture",
 19: "Football (Soccer)",
 20: "Italy, Portugal, France, and Parisian Arrondissement",
 21: "Music Industry",
 22: "Monarchy and Royal Families",
 23: "Historic Buildings and National Landmarks",
 24: "Olympics, Sports, and Protein Science in France",
 25: "Energy, Water, and Light Processes",
 26: "Automobiles and Engines",
 27: "Parks, Mountains, Rivers, Lakes, and Forests",
 28: "Singers, Actors, Birthdays, Nationalities",
 29: "United States Counties, Cities, Rivers",
}

# Replace the topic indices with topic labels
new_df['topic'] = new_df['topic'].map(topic_labels)

new_df.head()


In [40]:
new_df.to_csv('processed-data/my_book_post.csv', index=False)

In [41]:
new_df.to_json('processed-data/my_book_post.json', orient='records')

import pandas as pd
import json

result = new_df.to_json(orient='records')
parsed = json.loads(result)
json_out = json.dumps(parsed, indent=4)

with open('processed-data/my_book_post.json', 'w') as file:
    file.write(json_out)
