# Data Science for Social Justice Workshop Group Project: Ukraine


## 1. Preprocessing

### 1-1. Importing Data with pandas

In [None]:
%pwd ##it will be different for all

In [None]:
import os

In [None]:
os.chdir('data')  ##it will be different for all

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('submissions.csv')

In [None]:
df.shape

In [None]:
# 18 variables
list (df)

In [None]:
df.head(50)

### 1-2. Dropping Columns and Missing Values

In [None]:
##remove some columns that we are not going to use

df = df.drop(['self', 'url', 'subreddit', 'augmented_at', 'augmented_count'], axis=1)
df.head()

In [None]:
##get rid of posts that have been deleted/removed

df = df.loc[~df['selftext'].isin(['[removed]', '[deleted]' ]),:]
df.shape

In [None]:
## drop null values
df = df.dropna(subset=['selftext'])
df.shape

In [None]:
df.head(50)

In [None]:
#Remove posts in Russian
df = df[df['selftext'].map(lambda x: x.isascii())]

In [None]:
df.head()

In [None]:
df.shape

In [None]:
#number of unique users
print (df.iloc[:,4].nunique())



In [None]:
#frequency by user
frequency = df['author'].value_counts()

In [None]:
#top 50
frequency.head(50)


In [None]:
#mean 
frequency.mean()

In [None]:
##check
12421/8723

In [None]:
#average number of words in selftext
count = df['selftext'].str.split().str.len()


In [None]:
count

In [None]:
count.mean()

### 1-3. Cleaning Text Data

In [None]:
##!pip install spacy
##!python -m spacy download en_core_web_sm

In [None]:
# Import spaCy
import spacy
# Load the English preprocessing pipeline
nlp = spacy.load('en_core_web_sm')

In [None]:
# Test: Parse the first reddit post in the dataset
parsed_post = nlp(df.selftext.iloc[0])
print(parsed_post)

In [None]:
# Print each sentence in the parsed post
for idx, sentence in enumerate(parsed_post.sents):  
    ##In python, .sents is used for "sentence segmentation" which is present inside spacy. 
    print(f'Sentence {idx + 1}')
    print(sentence)
    print('') #space

### 1-4. Preprocessing all data

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
from gensim.models.phrases import Phrases, Phraser

In [None]:
# removing URLs 
import re
url_pattern = '(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'

In [None]:
def clean(token):
    """Helper function that specifies whether a token is:
        - punctuation
        - space
        - digit
    """
    return token.is_punct or token.is_space or token.is_digit

def line_read(df, text_col='selftext'):
    """Generator function to read in text from df and get rid of line breaks."""    
    for text in df[text_col]:
        yield re.sub(pattern=url_pattern,
                     repl="",
                     string=text.replace('\n', ''))

def preprocess(df, text_col='selftext', allowed_postags=['NOUN', 'ADJ']):
    """Preprocessing function to apply to a dataframe."""
    for parsed in nlp.pipe(line_read(df, text_col), batch_size=1000, disable=["tok2vec", "ner"]):
        # Gather lowercased, lemmatized tokens
        tokens = [token.lemma_.lower() if token.lemma_ != '-PRON-'
                  else token.lower_ 
                  for token in parsed if not clean(token)]
        # Remove specific lemmatizations, and words that are not nouns or adjectives
        tokens = [lemma
                  for lemma in tokens
                  if not lemma in ["'s",  "’s", "’"] and not lemma in allowed_postags]
        # Remove stop words
        tokens = [token for token in tokens if token not in spacy.lang.en.stop_words.STOP_WORDS]
        yield tokens
       # Remove url
def remove_URL(df, text_col='selftext'):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", text_col)


In [None]:
# This may take a while
lemmas = [line for line in preprocess(df)]

In [None]:
lemmas[32]

In [None]:
df.reset_index().head(50)

### 1-5.Phrase Modeling with `gensim`

In [None]:
from gensim.models.phrases import Phrases, Phraser

# Create bigram and trigram models
bigram = Phrases(lemmas, min_count=10, threshold=100)
trigram = Phrases(bigram[lemmas], min_count=10, threshold=50)  
bigram_phraser = Phraser(bigram)
trigram_phraser = Phraser(trigram)

# Form trigrams
trigrams = [trigram_phraser[bigram_phraser[doc]] for doc in lemmas]

In [None]:
# Join each into a string
trigrams_joined = [' '.join(trigram) for trigram in trigrams]
trigrams_joined[0]

In [None]:
#We can use .keys() to identify the bigrams in the dataset. How many bigrams were identified by the parser?
len(bigram_phraser.phrasegrams.keys())

In [None]:
#Look at biagrams
list(bigram_phraser.phrasegrams.keys())[:10]

In [None]:
# Look at trigrams
[trigram for trigram in list(trigram_phraser.phrasegrams.keys()) if trigram.count('_') == 2]

### 1-6. Save the file after preprocessing

In [None]:
# Inserting next to selftext column
df.insert(loc=7, column='lemmas', value=trigrams_joined)
# Removing empty rows in lemmas
df = df[~df['lemmas'].isin([''])]

In [None]:
df.head()

In [None]:
#most frequent lemmas
#lemmacount = df['lemmas'].value_counts()
## problems: the lemmas in each post is considerred as a single string

In [None]:
# "lemmacount"

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = df['selftext']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
lemma_count = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Use this if your scikit-learn is older
# pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [None]:
X

In [None]:
lemma_count.T.sort_values(by=0, ascending=False).head(30)

In [None]:
lemma_count[['patriot','patreon','patriarchal','patriarchate','patriotic','patriotism']]

In [None]:
# example to sum the total counts for lemma "patriot"
total = lemma_count['patriot'].sum()

In [None]:
total

In [None]:
# count all lemmas 
counts = [lemma_count[col].sum() for col in lemma_count]
# Julia    

# Here the "counts" is the total frequency of a lemma in the whole corpus. Still need to make a new dataframe for the lemma and their counts

In [None]:
# # count the actual amount of lemmas using TfidfVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Settings that you use for count vectorizer will go here
# tfidf_vectorizer = TfidfVectorizer(max_df=0.85,
#                                    decode_error='ignore',
#                                    stop_words='english',
#                                    smooth_idf=True,
#                                    use_idf=True)

# # Fit and transform the texts
# tfidf = tfidf_vectorizer.fit_transform(df['lemmas'])

In [None]:
# lemmas_count = tfidf_vectorizer.vocabulary_

In [None]:
# print(lemmas_count)

In [None]:
# lemmas_count_df = pd.DataFrame.from_dict(lemmas_count, orient='index',columns=['frequency'])

In [None]:
# lemmas_count_df.head(20)

In [None]:
# lemmas_count_df.sort_values(by=['frequency'], ascending=False).head(50)

In [None]:
###rank the count results but didn't get ranked correctly?
#lemmas_count_df.rank(axis=0,method='max',numeric_only=True)

In [None]:
# Save to new csv
df.to_csv('ukraine_lemmas.csv', index=False)

### For the next steps, make sure to use the file: 'ukraine_lemmas.csv'

## 2. Exploring Texts 



### 2-1. Diving Deeper into `pandas`

In [None]:
df = pd.read_csv('ukraine_lemmas.csv')

In [None]:
df.head(3)

In [None]:
# Sort dataframe by highest scores
df.sort_values(by=['score'], ascending=False)[:3]

In [None]:
# rows with a score higher than 500
df_top = df.loc[df['score'] >= 500, :]
len(df_top)

In [None]:
# unique value counts for a column
df.flair_text.value_counts()

### 2-2 Type-token ratio

In [None]:
# compute the TTR

def type_token_ratio(tokens):
    """Calculates type-token ratio on tokens."""
    numTokens = len(tokens)
    numTypes = len(set(tokens))
    return numTypes / numTokens

In [None]:
#loop over the first 10 lemmatized submissions into dataframe

for text in df['lemmas'][:10]:
    tokens = text.split()
    print('Text:\n', text)
    print('TTR:', type_token_ratio(tokens), '\n')

### 2-3 Processing and Analyzing Language with `Text()`

In [None]:
# Run if you do not have nltk installed
##!pip install nltk

In [None]:
tokens = []
for idx, row in enumerate(df['lemmas']):
    # Notice that we put all tokens in the same list
    tokens.extend(row.split(' '))

In [None]:
import nltk
nltk.download('stopwords')
from nltk.text import Text

##aita_tokens = Text(tokens)
ukraine_tokens = Text(tokens) ##MJ: I changed the name to make it corresponding to our dataset

### Concordances

In [None]:
ukraine_tokens.collocation_list()

In [None]:
# Change input arguments
ukraine_tokens.collocation_list(num=30, window_size=3)

### Word Plotting

In [None]:
ukraine_tokens.dispersion_plot(["stay_strong", "title"])

### Similar Words

In [None]:
ukraine_tokens.similar('partner')

### Common Context

In [None]:
ukraine_tokens.common_contexts(['Ukrainian', 'War'])  

## 2.4 Incorporating Time


In [None]:
#new colum with date and time
df.insert(loc=3, column='created_datetime', value=pd.to_datetime(df['created'], unit='s'))
df.head(3)

In [None]:
#create new variables years 
years = pd.DatetimeIndex(df['created_datetime']).year
print(years)

In [None]:
df.head(3)

In [None]:
#before 2013
df_2013 = df.loc[(years <= 2013), :]
len(df_2013)

In [None]:
df_2013

In [None]:
df_2013.flair_css_class.value_counts()

In [None]:
#after 2013 & before 2022
df_b2022 = df.loc[(years <= 2022) & (years >=2013), :]
len(df_b2022)

In [None]:
df_b2022

In [None]:
df_b2022.flair_css_class.value_counts()

In [None]:
#after 2022
df_a2022 = df.loc[(years >= 2022), :]
len(df_a2022)

In [None]:
df_a2022

In [None]:
df_a2022.flair_css_class.value_counts()

In [None]:
# visualize data ###3 different bars + normalized(count -> proportion)
#before 2013
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

#sns.set(rc={'figure.figsize': (7, 5)})

#p = sns.countplot(
#    data=df_2013,
#    x="flair_css_class",) ##it is empty

#plt.xticks(rotation=70)
#plt.tight_layout()

In [None]:
#after 2013 & before 2022
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

sns.set(rc={'figure.figsize': (7, 5)})

p = sns.countplot(
    data=df_b2022,
    x="flair_css_class",)


plt.xticks(rotation=70)
plt.tight_layout()

In [None]:
#after 2022
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

sns.set(rc={'figure.figsize': (7, 5)})

p = sns.countplot(
    data=df_a2022,
    x="flair_css_class",)

plt.xticks(rotation=70)
plt.tight_layout()

In [None]:
#We can save this dataframe as a cvs file so we don't have to run it everytime.

In [None]:
df_2013.to_csv('df_2013.csv', index=False)

In [None]:
df_b2022.to_csv('df_b2022.csv', index=False)

In [None]:
df_a2022.to_csv('df_a2022.csv', index=False)

# 3. Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
%pwd 

In [None]:
ukraine = pd.read_csv('ukraine_lemmas.csv')


In [None]:
ukraine.head()

In [None]:
ukraine['lemmas']

In [None]:
# create a matrix of word counts (what we just did with CountVectorizer), and immediately transform them into TF-IDF values.
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings that you use for count vectorizer will go here
tfidf_vectorizer = TfidfVectorizer(max_df=0.85,
                                   decode_error='ignore',
                                   stop_words='english',
                                   smooth_idf=True,
                                   use_idf=True,
                                  )

# Fit and transform the texts
tfidf = tfidf_vectorizer.fit_transform(ukraine['lemmas'])

In [None]:
tfidf

In [None]:
# Place TF-IDF values in a DataFrame
df_tfidf = pd.DataFrame(tfidf.todense(), columns=tfidf_vectorizer.get_feature_names_out().ravel())

In [None]:
#tfidf_vectorizer.get_feature_names()
#print(tfidf_vectorizer.vocabulary_)
### not sure whetehr they are helpful here

In [None]:
df_tfidf.shape

In [None]:
df_tfidf

In [None]:
# highest "average" TF-IDF across documents
df_tfidf.sum().sort_values(ascending=False)

In [None]:
# average value of tfidf
df_tfidf.mean().sort_values(ascending=False)

In [None]:
#the tfidf value of the first post submission
df_tfidf.iloc[0].sort_values(ascending=False)

In [None]:
ukraine['selftext'].iloc[9]

In [None]:
#the tfidf value of the 10th post submission
df_tfidf.iloc[9].sort_values(ascending=False).head(20)

## TF-IDF before 2013

In [None]:
import pandas as pd
%pwd

In [None]:
import os
os.chdir('data') #this will vary by user

In [None]:
df_2013 = pd.read_csv('df_2013.csv')

In [None]:
df_2013['lemmas']

In [None]:
# create a matrix of word counts (what we just did with CountVectorizer), and immediately transform them into TF-IDF values.
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings that you use for count vectorizer will go here
tfidf_vectorizer = TfidfVectorizer(max_df=0.85,
                                   decode_error='ignore',
                                   stop_words='english',
                                   smooth_idf=True,
                                   use_idf=True,
                                  )

# Fit and transform the texts
tfidf_2013 = tfidf_vectorizer.fit_transform(df_2013['lemmas'])

In [None]:
tfidf_2013

In [None]:
# Place TF-IDF values in a DataFrame
df_tfidf_2013 = pd.DataFrame(tfidf_2013.todense(), columns=tfidf_vectorizer.get_feature_names_out().ravel())

In [None]:
# highest "average" TF-IDF across documents
df_tfidf_2013.sum().sort_values(ascending=False)

## TF-IDF 2014-2022

In [None]:
df_b2022 = pd.read_csv('df_b2022.csv')

In [None]:
df_b2022['lemmas']

In [None]:
# create a matrix of word counts (what we just did with CountVectorizer), and immediately transform them into TF-IDF values.
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings that you use for count vectorizer will go here
tfidf_vectorizer = TfidfVectorizer(max_df=0.85,
                                   decode_error='ignore',
                                   stop_words='english',
                                   smooth_idf=True,
                                   use_idf=True,
                                  )

# Fit and transform the texts
tfidf_b2022 = tfidf_vectorizer.fit_transform(df_b2022['lemmas'])

In [None]:
tfidf_b2022

In [None]:
# Place TF-IDF values in a DataFrame
df_tfidf_b2022 = pd.DataFrame(tfidf_b2022.todense(), columns=tfidf_vectorizer.get_feature_names_out().ravel())

In [None]:
# highest "average" TF-IDF across documents
df_tfidf_b2022.sum().sort_values(ascending=False)

## TF-IDF after 2022

In [None]:
df_a2022 = pd.read_csv('df_a2022.csv')

In [None]:
df_a2022['lemmas']

In [None]:
# create a matrix of word counts (what we just did with CountVectorizer), and immediately transform them into TF-IDF values.
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings that you use for count vectorizer will go here
tfidf_vectorizer = TfidfVectorizer(max_df=0.85,
                                   decode_error='ignore',
                                   stop_words='english',
                                   smooth_idf=True,
                                   use_idf=True,
                                  )

# Fit and transform the texts
tfidf_a2022 = tfidf_vectorizer.fit_transform(df_a2022['lemmas'])

In [None]:
tfidf_a2022

In [None]:
# Place TF-IDF values in a DataFrame
df_tfidf_a2022 = pd.DataFrame(tfidf_a2022.todense(), columns=tfidf_vectorizer.get_feature_names_out().ravel())

In [None]:
# highest "average" TF-IDF across documents
df_tfidf_a2022.sum().sort_values(ascending=False)

# 3. Topic Modeling

## 3-1. Building Topic Models on Ukraine


In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
df.head(3)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
ukraine = pd.read_csv('ukraine_lemmas.csv')
X = ukraine['lemmas']
# Vectorize, using only the top 5000 TF-IDF values
vectorizer = TfidfVectorizer(max_features=5000)

tfidf =  vectorizer.fit_transform(X)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=3, max_iter=20, random_state=1)
lda = lda.fit(tfidf)

In [None]:
#Defining the fuction
def plot_top_words(model, feature_names, n_top_words=10, n_row=1, n_col=3, normalize=False):
    """Plot the top words for an LDA model.
    
    Parameters
    ----------
    model : LatentDirichletAllocation object
        The trained LDA model.
    feature_names : list
        A list of strings containing the feature names.
    n_top_words : int
        The number of top words to show for each topic.
    n_row : int
        The number of rows to use in the subplots.
    n_col : int
        The number of columns to use in the subplots.
    normalize : bool
        If True, normalizes the topic model weights.
    """
    fig, axes = plt.subplots(n_row, n_col, figsize=(3 * n_col, 5 * n_row), sharex=True)
    axes = axes.flatten()
    components = model.components_
    if normalize:
        components = components / components.sum(axis=1)[:, np.newaxis]

    for topic_idx, topic in enumerate(components):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 20})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)

        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

    return fig, axes

In [None]:
token_names = vectorizer.get_feature_names_out()
plot_top_words(lda, token_names, 25)
plt.show()

- Topic 1: Russia on Ukraine war
- Topic 2: Travel/Immigration
- Topic 3: Social media/Type of posts

- Other Comments: Ukrainian president not listed, how come?

## 3-2. Topic Weights Across Documents


In [None]:
topic_distributions = lda.transform(tfidf)

In [None]:
print(tfidf.shape)
print(topic_distributions.shape)
print(topic_distributions)

In [None]:
# Generic topic names
columns = [
    "Topic 1",
    "Topic 2",
    "Topic 3"
]

# Or, choose topics
columns = [
    "War",
    "Travel/Immigration",
    "Social media/Type of posts"
]

In [None]:
topic_df = pd.DataFrame(topic_distributions, columns=columns)
topic_df.head()

In [None]:
topic_df.insert(loc=0, column='text', value=ukraine['selftext'])
topic_df.head()

In [None]:
idxs = [1, 2, 3]

for idx in idxs:
    print(topic_df['text'].iloc[idx][:500])
    print(topic_df.iloc[idx, 1:])
    print('----')

## 3-3. Change the number of topics

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
ukraine = pd.read_csv('ukraine_lemmas.csv')
X = ukraine['lemmas']
# Vectorize, using only the top 5000 TF-IDF values
vectorizer = TfidfVectorizer(max_features=5000)

tfidf =  vectorizer.fit_transform(X)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=5, max_iter=20, random_state=1)
lda = lda.fit(tfidf)

In [None]:
#Defining the fuction
def plot_top_words(model, feature_names, n_top_words=10, n_row=1, n_col=5, normalize=False):
    """Plot the top words for an LDA model.
    
    Parameters
    ----------
    model : LatentDirichletAllocation object
        The trained LDA model.
    feature_names : list
        A list of strings containing the feature names.
    n_top_words : int
        The number of top words to show for each topic.
    n_row : int
        The number of rows to use in the subplots.
    n_col : int
        The number of columns to use in the subplots.
    normalize : bool
        If True, normalizes the topic model weights.
    """
    fig, axes = plt.subplots(n_row, n_col, figsize=(3 * n_col, 5 * n_row), sharex=True)
    axes = axes.flatten()
    components = model.components_
    if normalize:
        components = components / components.sum(axis=1)[:, np.newaxis]

    for topic_idx, topic in enumerate(components):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 20})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)

        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

    return fig, axes

In [None]:
token_names = vectorizer.get_feature_names_out()
plot_top_words(lda, token_names, 25)
plt.show()

#Interpretation
- Topic 1
- Topic 2
- Topic 3
- Topic 4
- Topic 5

In [None]:
topic_distributions = lda.transform(tfidf)

In [None]:
print(tfidf.shape)
print(topic_distributions.shape)
print(topic_distributions)

In [None]:
# Generic topic names
columns = [
    "Topic 1",
    "Topic 2",
    "Topic 3",
    "Topic 4",
    "Topic 5"
]

# Or, choose topics
columns = [
    "War",
    "",
    "", 
    "",
    ""
]

In [None]:
topic_df = pd.DataFrame(topic_distributions, columns=columns)
topic_df.head()

In [None]:
topic_df.insert(loc=0, column='text', value=ukraine['selftext'])
topic_df.head()

In [None]:
idxs = [1, 2, 3]

for idx in idxs:
    print(topic_df['text'].iloc[idx][:500])
    print(topic_df.iloc[idx, 1:])
    print('----')

## 3-4. Topic Modeling in different phases(before 2013, between 2014-2022), after 2022) 

### 3-4-1. Topic Modeling before 2013

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

X = df_2013['lemmas']
# Vectorize, using only the top 5000 TF-IDF values
vectorizer = TfidfVectorizer(max_features=5000)

tfidf_2013 =  vectorizer.fit_transform(X)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda_2013 = LatentDirichletAllocation(n_components=5, max_iter=20, random_state=0)
lda_2013 = lda_2013.fit(tfidf_2013)

In [None]:
#Defining the fuction
def plot_top_words(model, feature_names, n_top_words=10, n_row=1, n_col=5, normalize=False):
    """Plot the top words for an LDA model.
    
    Parameters
    ----------
    model : LatentDirichletAllocation object
        The trained LDA model.
    feature_names : list
        A list of strings containing the feature names.
    n_top_words : int
        The number of top words to show for each topic.
    n_row : int
        The number of rows to use in the subplots.
    n_col : int
        The number of columns to use in the subplots.
    normalize : bool
        If True, normalizes the topic model weights.
    """
    fig, axes = plt.subplots(n_row, n_col, figsize=(3 * n_col, 5 * n_row), sharex=True)
    axes = axes.flatten()
    components = model.components_
    if normalize:
        components = components / components.sum(axis=1)[:, np.newaxis]

    for topic_idx, topic in enumerate(components):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 20})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)

        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

    return fig, axes

In [None]:
token_names = vectorizer.get_feature_names_out()
plot_top_words(lda_2013, token_names, 25)
plt.show()

- Topic 1: Daily life/Travel
- Topic 2: 
- Topic 3: 
- Comments: No war-related words

### 3-4-1. Topic Modeling between 2014 and 2022

In [7]:
%pwd 

'C:\\Users\\moren\\OneDrive\\Documents\\Third Year\\Summer 22\\Data Science\\Data-Science-Social-Justice-main\\Project'

In [8]:
os.chdir('data') 

In [11]:
df_a2022 = pd.read_csv('df_a2022.csv')

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

X = df_a2022['lemmas']
# Vectorize, using only the top 5000 TF-IDF values
vectorizer = TfidfVectorizer(max_features=5000)

tfidf_a2022 =  vectorizer.fit_transform(X)

In [15]:
from sklearn.decomposition import LatentDirichletAllocation
lda_a2022 = LatentDirichletAllocation(n_components=5, max_iter=20, random_state=0)
lda_a2022 = lda_a2022.fit(tfidf_a2022)

In [16]:
#Defining the fuction
def plot_top_words(model, feature_names, n_top_words=10, n_row=1, n_col=5, normalize=False):
    """Plot the top words for an LDA model.
    
    Parameters
    ----------
    model : LatentDirichletAllocation object
        The trained LDA model.
    feature_names : list
        A list of strings containing the feature names.
    n_top_words : int
        The number of top words to show for each topic.
    n_row : int
        The number of rows to use in the subplots.
    n_col : int
        The number of columns to use in the subplots.
    normalize : bool
        If True, normalizes the topic model weights.
    """
    fig, axes = plt.subplots(n_row, n_col, figsize=(3 * n_col, 5 * n_row), sharex=True)
    axes = axes.flatten()
    components = model.components_
    if normalize:
        components = components / components.sum(axis=1)[:, np.newaxis]

    for topic_idx, topic in enumerate(components):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 20})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)

        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

    return fig, axes

In [23]:
token_names = vectorizer.get_feature_names_out()
plot_top_words(lda_a2022, token_names, 25)
plt.show()
#nor working to show the plot

NameError: name 'plt' is not defined

### 3-4-3. Topic Modeling after 2022

In [19]:
df_b2022 = pd.read_csv('df_b2022.csv')

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

X = df_b2022['lemmas']
# Vectorize, using only the top 5000 TF-IDF values
vectorizer = TfidfVectorizer(max_features=5000)

tfidf_b2022 =  vectorizer.fit_transform(X)

In [25]:
from sklearn.decomposition import LatentDirichletAllocation
lda_b2022 = LatentDirichletAllocation(n_components=5, max_iter=20, random_state=0)
lda_b2022 = lda_b2022.fit(tfidf_b2022)

In [26]:
#Defining the fuction
def plot_top_words(model, feature_names, n_top_words=10, n_row=1, n_col=5, normalize=False):
    """Plot the top words for an LDA model.
    
    Parameters
    ----------
    model : LatentDirichletAllocation object
        The trained LDA model.
    feature_names : list
        A list of strings containing the feature names.
    n_top_words : int
        The number of top words to show for each topic.
    n_row : int
        The number of rows to use in the subplots.
    n_col : int
        The number of columns to use in the subplots.
    normalize : bool
        If True, normalizes the topic model weights.
    """
    fig, axes = plt.subplots(n_row, n_col, figsize=(3 * n_col, 5 * n_row), sharex=True)
    axes = axes.flatten()
    components = model.components_
    if normalize:
        components = components / components.sum(axis=1)[:, np.newaxis]

    for topic_idx, topic in enumerate(components):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 20})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)

        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

    return fig, axes

In [28]:
token_names = vectorizer.get_feature_names_out()
plot_top_words(lda_b2022, token_names, 25)
plt.show()
#not working to show the plot

NameError: name 'plt' is not defined

# 4. Word Embeddings

### 4-1. Between 2014 and 2022


In [30]:
# Package imports
import os
import pandas as pd
import pickle
%pwd
os.chdir('data') 

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'data'

In [31]:
# Import dataset
df_b2022 = pd.read_csv('df_b2022.csv')
df.head(3)
print(df.shape)
df.head()

(12292, 15)


Unnamed: 0,idint,idstr,created,created_datetime,nsfw,author,title,selftext,lemmas,score,distinguish,textlen,num_comments,flair_text,flair_css_class
0,111782174,t3_1ujvoe,1389030774,2014-01-06 17:52:54,0,MaFi0s0,Speaking Russian in small Ukrainian cities?,"I understand half of Ukraine speaks Russian, s...",understand half ukraine speaks russian plan le...,4,,351,16,,
1,111982888,t3_1uo6js,1389143126,2014-01-08 01:05:26,0,[deleted],Met a Ukrainian girl and was wondering if y'al...,I was just wondering if y'all could teach me s...,wondering teach phrases impress tried learn ho...,0,,169,2,,
2,112040900,t3_1upfb8,1389183858,2014-01-08 12:24:18,0,IdiotBrit,Moving to Ukraine,I brought my wife to England a few years ago. ...,brought wife england years_ago actually wife f...,9,,654,7,,
3,113297129,t3_1vgcmh,1389974937,2014-01-17 16:08:57,0,[deleted],American gifts for Ukrainian villagers,"Moving to Ukraine soon, and living in small Ru...",moving ukraine soon living small russian speak...,1,,193,4,,
4,113801822,t3_1vr61q,1390302825,2014-01-21 11:13:45,0,PocketSandInc,Lets start a collection of all the live stream...,Here's what I have so far:\n\n* https://www.yo...,far url url url url url url url url url,16,,475,3,,


In [32]:
trigrams = [lemma.split(' ') for lemma in df['lemmas']]

## 5-1 Constructing a Word2Vec Model

In [33]:
from gensim.models import Word2Vec
import multiprocessing

In [34]:
# Count the number of cores you have at your disposal
cores = multiprocessing.cpu_count()
# Word vector dimensionality (how many features each word will be given)
n_features = 300
# Minimum word count to be taken into account
min_word_count = 10
# Number of threads to run in parallel (equal to your amount of cores)
n_workers = cores
# Context window size
window = 5
# Downsample setting for frequent words
downsampling = 1e-2
# Seed for the random number generator (to create reproducible results)
seed = 1 
# Skip-gram = 1, CBOW = 0
sg = 1
epochs = 20

model = Word2Vec(
    sentences=trigrams,
    workers=n_workers,
    vector_size=n_features,
    min_count=min_word_count,
    window=window,
    sample=downsampling,
    seed=seed,
    sg=sg)

In [35]:
model.train(trigrams, total_examples=model.corpus_count, epochs=10)        

(3361640, 3954780)

In [38]:
model.save('aita.emb')

In [39]:
model = Word2Vec.load('aita.emb')

In [40]:
len(model.wv)

5051

In [41]:
model.wv.index_to_key[0]

'ukraine'

In [42]:
model.wv.vectors[0]

array([-1.05339877e-01,  1.15733460e-01,  1.54810781e-02,  1.32737949e-01,
       -1.07326552e-01, -1.77850440e-01,  8.69322047e-02,  3.04783024e-02,
        1.13598727e-01, -9.01125893e-02,  3.02714676e-01, -1.16310060e-01,
        3.57329138e-02,  2.26721540e-02, -2.32074693e-01, -2.45371640e-01,
        4.36770096e-02, -1.00320481e-01, -2.47257069e-01, -3.10011625e-01,
        1.97342187e-01,  1.58299774e-01,  3.23055014e-02,  3.11491378e-02,
        1.85619175e-01,  1.79060832e-01,  6.75676099e-04,  1.18636638e-01,
       -2.19166115e-01,  1.97160188e-02, -1.69005483e-01, -2.02404425e-01,
        5.74063808e-02, -1.08900003e-01, -4.96282950e-02, -4.37690951e-02,
        1.78953782e-01, -4.22112644e-01, -6.17015287e-02, -4.01663873e-03,
       -1.33598953e-01,  1.33213267e-01,  7.90039822e-02,  7.94901103e-02,
        1.11242875e-01, -6.77441210e-02, -1.09021761e-01,  7.81679079e-02,
        6.60476554e-03,  1.42786533e-01, -1.10919856e-01,  6.67778263e-03,
       -1.72186285e-01, -

In [None]:
#Not sure how far we want to go with this 