<a href="https://colab.research.google.com/github/nandinijirobe/CS521_FinalProject/blob/main/Copy_of_Final_CS_521_Research_Project_4_22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install Packages**

In [None]:
!pip install convokit # used to access reddit corpus
!pip install codeswitch # nlp tool for hinglish language identification, pos tagging
!pip install cleantext # used to clean raw text data
!pip install demoji # used to remove presence of emojis from text

# **Get Reddit Repo Lists**

In [None]:
# Mount Google Drive at /content/drive so later cells can read/write the project
# files stored under MyDrive (this import requires the Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# The subreddits file contains a list of subreddits relevant to India and Hindi.
# The code below builds a dataframe with one row per subreddit and the number of
# utterances it contains.
data = []
# `with` guarantees the file handle is closed, even if a line fails to parse
with open(r'/content/drive/MyDrive/CS521_FinalProject/Resources/unfamiliar_indian_subreddits.txt', 'r') as subreddits_file:
  for line in subreddits_file:
    if not line.strip():
      continue # tolerate blank lines (e.g. a trailing newline at the end of the file)
    # the subreddit name is everything before the ".corpus" suffix
    subreddit = line[0:line.index(".corpus")]
    # the count is read starting 3 characters after the first ":" on the line
    count = int(line[line.index(":")+3:].strip())
    data.append([subreddit, count])

# largest subreddits first
subreddits_df = pd.DataFrame(data, columns=['subreddits', 'num_utterances'])
subreddits_df = subreddits_df.sort_values(by=['num_utterances'], ascending=False)

In [None]:
subreddits_df.head(10)

In [None]:
# Report the overall size of the subreddit catalogue: total utterances, then row count
for label, value in (
    ("Total utterances across all subreddits:", subreddits_df['num_utterances'].sum()),
    ("Total subreddits:", len(subreddits_df)),
):
  print(label, value)

# **Merge All Dataframes**

In [None]:
from convokit import Corpus, download

# The subreddit we will be working with for this project is IndiaSpeaks.
# It is a general subreddit for news, entertainment, science & technology, sports,
# history & culture, economy and geopolitics related to India.
master_corpus = Corpus(filename=download("subreddit-IndiaSpeaks")) # download data from the IndiaSpeaks subreddit
total_utter_count = len(master_corpus.utterances) # store the total number of utterances in this subreddit

# The code below merges the data from all India/Hindi related subreddits into one corpus.
# Uncomment only if enough compute power and storage is available.
# Otherwise continue to work with the IndiaSpeaks subreddit as it is large and general.
# (The loop iterates over the names collected in subreddits_df above.)
# for subreddit in subreddits_df['subreddits']:
#   subreddit_name = "subreddit-" + subreddit
#   print("Currently downloading", subreddit_name, "...")
#   corpus = Corpus(filename=download(subreddit_name))
#   master_corpus =  Corpus.merge(master_corpus, corpus)
#   total_utter_count += len(corpus.utterances)

print(total_utter_count)

# **Cleaning**

In [None]:
from codeswitch.codeswitch import LanguageIdentification # import hinglish language identification tool
lid = LanguageIdentification('hin-eng') # load model to classify words

In [None]:
import cleantext # used to clean raw text data
import demoji # used to remove presence of emojis
cleaned = 0 # number of utterances clean_utterance has been called on so far
def clean_utterance(text):
  """Clean one utterance and report progress.

  Prints the share of `master_corpus` handled before this call, bumps the
  global `cleaned` counter, then runs `text` through `cleantext.clean`
  (extra_spaces, lowercase, numbers and punct options enabled) and finally
  replaces emojis with the empty string via `demoji.replace`.

  Returns the cleaned text.
  """
  global cleaned
  progress_pct = (cleaned/len(master_corpus)) * 100
  print(progress_pct, "% Cleaned")
  cleaned = cleaned + 1
  normalized = cleantext.clean(text, extra_spaces=True, lowercase=True, numbers=True, punct=True)
  return demoji.replace(normalized, "")

In [None]:
# extract just the utterances from the entire corpus
# (guarded so that re-running this cell, when master_corpus is already a dataframe, does not fail)
if hasattr(master_corpus, "get_utterances_dataframe"):
  master_corpus = master_corpus.get_utterances_dataframe()
# remove placeholder/empty texts from the dataframe; .copy() yields an independent frame so that
# assigning to its "text" column later does not trigger pandas' SettingWithCopyWarning
master_corpus = master_corpus[~master_corpus["text"].isin(["[removed]", "", "[deleted]"])].copy()

In [None]:
# clean all utterances: run clean_utterance on every row of the "text" column and
# store the result back in place (clean_utterance prints one progress line per row)
master_corpus["text"] = master_corpus["text"].apply(clean_utterance)

In [None]:
# remove rows whose text became empty (or whitespace-only) as a result of cleaning
master_corpus = master_corpus[(master_corpus["text"].str.strip() != "")]

# Cleaning takes a while. Store the result as a csv file in Drive so we won't have to run the code again
# (the dataframe index is written too, since to_csv is called without index=False)
master_corpus.to_csv(r'/content/drive/MyDrive/CS521_FinalProject/Resources/cleaned_corpus.csv')

# **Break Down Data**

In [None]:
# Reload the cleaned corpus from Drive into master_corpus in case the runtime
# disconnected -- the download and cleaning cells above do not need to be re-run
import pandas as pd
master_corpus = pd.read_csv(r'/content/drive/MyDrive/CS521_FinalProject/Resources/cleaned_corpus.csv')

In [None]:
# master_corpus (which contains the IndiaSpeaks subreddit) still has a lot of data.
# Working with all the data at once gives us runtime issues, so we need to process
# it in batches or make it smaller.
corpus_size = len(master_corpus)
slice_size = corpus_size//30 # each batch is one thirtieth of the corpus (integer division)
print("Corpus size:", corpus_size)
print("Slice size:", slice_size)
# We are going to be working with 15k texts specifically

In [None]:
# We need to slice the dataframe and perform batch processing so we don't get a runtime error from running the code for too long
# Only the first slice is processed below; further batches can be enabled by uncommenting.
master_corpus1 = master_corpus.iloc[0:slice_size]
# master_corpus2 = master_corpus.iloc[slice_size:slice_size*2]
# master_corpus3 = master_corpus.iloc[slice_size*2:slice_size*3]
# master_corpus4 = master_corpus.iloc[slice_size*3:corpus_size]

# **Filter out Hinglish Text**

In [None]:
"""
This function determines whether a piece of text is Hinglish.
The condition for a text to qualify is that it should have at
least 20% Hindi and 20% English.
"""
utter_checked = 1 # this keeps track of the number of texts the below function has checked if it is hinglish
hinglish_found = 0 # this keeps track of the number of hinglish texts found

def isHinglish (text, min_pct=20):
  """Return True if `text` is Hinglish, False otherwise.

  A text qualifies when at least `min_pct` percent of its whitespace-separated
  words are tagged Hindi AND at least `min_pct` percent are tagged otherwise
  (every token whose entity is not "hin" is counted as English).

  Parameters:
    text: the value to classify; anything that is not a non-blank string
      (e.g. None or a NaN produced by read_csv) is reported as not Hinglish.
    min_pct: minimum percentage required for each language (default 20).

  Returns:
    bool -- always an explicit True/False so the result can be used directly
    as a pandas boolean mask.

  Side effects: increments the global `utter_checked` on every call and
  `hinglish_found` on every match, and prints progress relative to
  `master_corpus1`, the slice this filter is applied to.
  """
  global hinglish_found
  global utter_checked
  checked_so_far = utter_checked
  utter_checked += 1

  # Non-strings and blank strings cannot be Hinglish. Returning False (rather than
  # falling through to None) keeps the mask boolean and avoids dividing by zero below.
  if not isinstance(text, str):
    return False
  word_list = text.split() # get the list of words in text
  if not word_list:
    return False

  print(checked_so_far/len(master_corpus1) * 100, "% Checked")

  hin_count = 0
  eng_count = 0

  # categorize each token in text as either hindi or english
  for token in lid.identify(text):
    if "#" in token['word']:
      # This is a continuation piece of a split word.
      # The word has already been counted. Ignore.
      continue
    if token['entity'] == "hin":
      hin_count += 1
    else:
      eng_count += 1

  # calculate the percentages of hindi and english words
  hin_pct = round((hin_count/len(word_list))*100, 2)
  eng_pct = round((eng_count/len(word_list))*100, 2)

  # A text is only considered Hinglish if both languages reach the minimum share
  is_hinglish = hin_pct >= min_pct and eng_pct >= min_pct
  if is_hinglish:
    hinglish_found += 1
    print("Hinglish found", hinglish_found)
  return is_hinglish

In [None]:
# only keep hinglish text: isHinglish is evaluated once per row of the "text" column
# and the resulting values are used as a row mask
master_corpus1 = master_corpus1[master_corpus1["text"].apply(isHinglish)]

In [None]:
# filtering out the hinglish text from master_corpus1 took a while. saving it to a csv so we won't have to redo it
# only the "text" column is kept; every other column is dropped from master_corpus1 here
master_corpus1 = master_corpus1["text"]
master_corpus1.to_csv("/content/drive/MyDrive/CS521_FinalProject/Resources/master_corpus1_hinglish.csv", index=False)


# **Remove Text with Devanagari Text**

In [None]:
import pandas as pd
# reload the saved hinglish texts so this section can run without repeating the filtering above
master_corpus1 = pd.read_csv("/content/drive/MyDrive/CS521_FinalProject/Resources/master_corpus1_hinglish.csv")

In [None]:
# ensure there is no text which contains devanagari text because we are only focusing on text written using roman script
import re

# The Devanagari Unicode block ranges from U+0900 to U+097F (inclusive).
# devangariAlphabet is a regex character class that matches any single
# character in that block; it is used to check whether a text contains
# any Devanagari characters.
devangariAlphabet = "[\\u0900-\\u097F]"
_DEVANAGARI_RE = re.compile(devangariAlphabet) # compiled once and reused on every call

# checks if text contains any devanagari characters
def hasDevangariAlpha(text):
  """Return True if `text` contains at least one character in U+0900..U+097F.

  Parameters:
    text: the string to inspect.

  Returns:
    bool -- True when a Devanagari character is present, False otherwise
    (including for the empty string).
  """
  # search stops at the first match; there is no need to collect every match
  return _DEVANAGARI_RE.search(text) is not None

In [None]:
# remove any text that contains devanagari letters from the data
# (~ inverts the mask, so only rows for which hasDevangariAlpha is False are kept)
master_corpus1 = master_corpus1[~master_corpus1["text"].apply(hasDevangariAlpha)]

In [None]:
import pandas as pd
# update the saved csv file again, overwriting it with the devanagari-free rows
master_corpus1.to_csv("/content/drive/MyDrive/CS521_FinalProject/Resources/master_corpus1_hinglish.csv", index=False)

# **POS Tagging**

In [None]:
master_corpus1.head(5)

In [None]:
# Load the two hin-eng models used by pos_tagger below.
from codeswitch.codeswitch import POS
pos = POS('hin-eng') # load model to POS tag words
# lid is loaded again here (it is also loaded in the Cleaning section) so that this
# section can be run on its own after a runtime restart
from codeswitch.codeswitch import LanguageIdentification
lid = LanguageIdentification('hin-eng') # load model to classify words

In [None]:
"""
This function splits each sentence into tokens and POS tags each token.
It keeps track of total Hindi and English counts for each POS tag and updates
global dictionaries. It is used to figure out when a person is more likely to code
switch to another language.
"""
eng_pos_counts = {} # will contain pos tag counts for all english words
hin_pos_counts = {} # will contain pos tag counts for all hindi words
hin_total = 0 # will store the total number of hindi words
eng_total = 0 # will store the total number of english words
total_word_count = 0 # will store the total number of whitespace-separated words seen
total_pos_counts = {} # will contain pos tag counts for all words
utter_count = 0 # number of texts pos_tagger has processed
def pos_tagger (text):
  """Tag one text and accumulate per-language POS counts in the global dicts.

  Every token returned by the language identifier is paired with the token
  at the same position in the POS tagger's output. Continuation pieces of
  split words (tokens whose text contains "#") are skipped. Each remaining
  token adds one to `total_pos_counts[tag]` and to either the Hindi or the
  English counters, depending on whether its language entity is "hin".

  Parameters:
    text: the string to tag.

  Returns:
    None -- results are accumulated in `eng_pos_counts`, `hin_pos_counts`,
    `total_pos_counts`, `hin_total`, `eng_total`, `total_word_count` and
    `utter_count`. A progress line relative to `master_corpus1` is printed.
  """
  global total_word_count
  global hin_total
  global eng_total
  global utter_count
  word_list = text.split()
  lid_result = lid.identify(text) # give each token in text a hindi/english tag
  pos_result = pos.tag(text) # give each token in text a pos tag
  utter_count += 1 # keeps track of the number of texts function has processed
  print(((utter_count/len(master_corpus1))*100), "% has been POS tagged...")

  # Iterate over the model tokens themselves instead of range(len(word_list)):
  # the models can return more tokens than there are whitespace words (split
  # words) or fewer, so indexing by word position either drops trailing tokens
  # or raises IndexError.
  # NOTE(review): assumes both models tokenize the text identically so that
  # position i refers to the same token in both results -- confirm.
  for lid_token, pos_token in zip(lid_result, pos_result):
    if "#" in lid_token['word']:
      # This is a continuation piece of a split word.
      # The word has already been counted. Ignore.
      continue
    tag = pos_token['entity']
    total_pos_counts[tag] = total_pos_counts.get(tag, 0) + 1 # update total pos tags container

    if lid_token['entity'] == "hin": # update hindi pos tags container
      hin_total += 1
      hin_pos_counts[tag] = hin_pos_counts.get(tag, 0) + 1
    else: # update english pos tags container
      eng_total += 1
      eng_pos_counts[tag] = eng_pos_counts.get(tag, 0) + 1

  total_word_count += len(word_list)
  return


In [None]:
# replace newlines inside each text with spaces before tagging
master_corpus1["text"] = master_corpus1["text"].apply(lambda x: x.replace("\n", " ")) # needed to do some extra cleaning
# pos_tagger returns None for every row; it is called for its side effect of
# filling the global count dictionaries
master_corpus1["text"].apply(pos_tagger) # start calculating the pos tags for all texts in dataframe

In [None]:
master_corpus1.head(20)

# **Preparing Data for Graphing**

In [None]:
import pandas as pd
import math

# The dictionaries created above for each language's pos_counts need to be used to create
# graphs. However they need to be made into dataframes if we want to use the seaborn library
# The below code combines the dictionaries and makes a dataframe
# make these dictionaries into a dataframe so we can save results as a csv and also use it make seaborn graphs
def two_dicts_to_df(dict1, dict2, total_dict, tag_type):
  """Combine per-language tag counts into a long-format dataframe for plotting.

  Parameters:
    dict1: tag -> count for English words.
    dict2: tag -> count for Hindi words.
    total_dict: tag -> count over all words; its keys (in their existing
      order) decide which tags appear in the output.
    tag_type: prefix for the tag column name, e.g. "pos" gives "pos_tags".

  Returns:
    A dataframe with columns "<tag_type>_tags", "language" and "count". Each
    tag contributes two rows ("English" then "Hindi"). "count" holds that
    language's share of the tag's total as a whole-number percentage,
    truncated by int().

  A tag missing from dict1 or dict2 is treated as 0, and a tag whose total
  is 0 yields 0 rather than dividing by zero. The input dictionaries are
  not modified.
  """
  tag_column = tag_type + "_tags"
  data = {tag_column: [], "language": [], "count": []} # create columns for dataframe

  # iterate over the totals passed in by the caller, not a module-level global
  for tag, total in total_dict.items():
    for language, counts in (("English", dict1), ("Hindi", dict2)):
      share = int((counts.get(tag, 0)/total) * 100) if total else 0
      data[tag_column].append(tag)
      data["language"].append(language)
      data["count"].append(share)

  return pd.DataFrame(data) # create the dataframe

# build the plotting dataframe from the counts gathered by pos_tagger
# (English counts first, Hindi counts second) and preview it
pos_df = two_dicts_to_df(eng_pos_counts, hin_pos_counts, total_pos_counts, "pos")
pos_df.head(20)

# **Results shown as Graph**

In [None]:
import seaborn as sns
sns.set_theme(style="dark")

# Grouped bar chart: one group per POS tag, one bar per language.
# The "count" column of pos_df holds percentages (see the y-axis label below).
g = sns.catplot(
    data=pos_df, kind="bar",
    x="pos_tags", y="count", hue="language",
    errorbar="sd", palette="dark", alpha=.6, height=5, aspect=2
)
g.despine(left=True)
g.set_xticklabels(rotation=20) # tilt tag names so they do not overlap
g.set_axis_labels("POS Tags", "% Out of Total Tags")
g.legend.set_title("Distribution of POS Tags by Language")
# place the legend's upper-left corner at (0.9, 1), towards the right edge of the figure
sns.move_legend(g, "upper left", bbox_to_anchor=(0.9, 1))

# **Topic Modelling: LDA**
**Note**: I wasn't familiar with LDA topic modelling, so I followed a tutorial to write the code below: https://www.youtube.com/watch?v=b91ohJvEst4

In [None]:
!pip install pandas

In [None]:
# `pd` must already be imported by an earlier cell for this line to run
master_corpus1 = pd.read_csv("/content/drive/MyDrive/CS521_FinalProject/Resources/master_corpus1_hinglish.csv") # reload master_corpus1 again

In [None]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

# hinglish stop words we don't want to include in the topic modelling:
# sklearn's built-in English list plus a few frequent romanized Hindi function words
hindi_sw = ['ka', 'nahi', 'ki', 'hai', 'ke', 'ho', 'kya', 'bhi', 'aa', 'toh', 'ko', 'ye', 'gaya']
hinglish_stop_words = list(text.ENGLISH_STOP_WORDS.union(hindi_sw))

# min_df specifies we want to keep words that show up in at least 5 docs/texts
# max_df specifies we want to keep words that show up in at most 75% of docs/texts. This helps us exclude common words
tfidf_vectorizer = TfidfVectorizer(stop_words = hinglish_stop_words, min_df = 5, max_df = 0.75)

# tokenize the data, remove stop words, remove most common words, and return what is left
# NOTE(review): LDA is commonly fit on raw term counts (CountVectorizer) rather than
# tf-idf weights -- confirm that tf-idf input is intended here
doc_term_matrix = tfidf_vectorizer.fit_transform(master_corpus1['text'])

# rows = texts, columns = vocabulary terms kept by the vectorizer
print(f'Rows: {doc_term_matrix. shape[0]}, Columns: {doc_term_matrix. shape[1]}' )

In [None]:
from sklearn. decomposition import LatentDirichletAllocation

num_topics = 3 # this is the number of topics we are trying to find across all the texts
# random_state is fixed so repeated runs give the same topics
lda_topic_model = LatentDirichletAllocation(n_components = num_topics, random_state = 12345)
doc_topic_matrix = lda_topic_model. fit_transform(doc_term_matrix) # train and get texts' topic assignments
col_names = [f'Topic {x}' for x in range(1, num_topics + 1)] # "Topic 1" .. "Topic <num_topics>"
doc_topic_df = pd.DataFrame(doc_topic_matrix, columns = col_names) # display each document's topic assignments
doc_topic_df.head(n = 10)

# **Topic Modelling Results**

In [None]:
num_words = 10 # the top words we want to display for each topic

# Look the vocabulary up once; the original rebuilt it for every printed word
feature_names = tfidf_vectorizer.get_feature_names_out()

# The code below prints the top words for each topic, highest weight first
for topic, words in enumerate(lda_topic_model.components_):
  sorted_words = words.argsort()[::-1] # term indices ordered by descending weight
  print(f'\nTopic {topic + 1:02d}')
  # slicing (rather than range(num_words)) also copes with a vocabulary smaller than num_words
  for word_idx in sorted_words[:num_words]:
    print(f'{feature_names[word_idx]} ({words[word_idx]:.3f})')