# Load Packages

In [9]:
import pandas as pd
import numpy as np
import json
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
import ast
import os
from os import path
from pandas.plotting import table
from google.colab import output
from nltk.stem import PorterStemmer
from itertools import chain
from collections import Counter
import operator
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import string
from google.colab import files
# Clear whatever the import cell has written to the Colab output area so far.
output.clear()

# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline
# Let pandas show up to 300 characters per cell before truncating the text.
pd.set_option('display.max_colwidth', 300)

# Load Data

In [None]:
# Mount Google Drive at /content/drive; the data paths used by the loading
# cell all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [11]:
# Drive folders that hold the two pre-processed corpora.
load_path = '/content/drive/MyDrive/NLP_Project/data/sentence_broken/'
load_path2 = '/content/drive/MyDrive/NLP_Project/data/word_broken/'

# NOTE: unpickling can execute arbitrary code -- only load pickles you trust.
# (A former clean-up step that dropped the first column and reset the index
# of each frame is intentionally not applied here.)
sentence_level_df = pd.read_pickle(path.join(load_path, 'sentence_level_df.pkl'))
word_level_df = pd.read_pickle(path.join(load_path2, 'word_level_df.pkl'))

# Parts

### Common and non-common words between tags

In [12]:
# For every genre, split the whole vocabulary into three buckets:
#   * words that occur only in that genre,
#   * words that occur only in the other genres,
#   * words that occur in both.
# The three result lists are index-aligned with the rows of word_level_df.
individual_distinct_words = []
others_ditinct_words = []  # (sic) spelling kept: the plotting cell reads this exact name
common_words = []

all_groups = word_level_df['Words'].values
word_counter = Counter(chain(*all_groups))
# The Counter keys already are the unique vocabulary. Taking them directly
# avoids expanding every token occurrence via elements(), and gives a stable
# first-seen order instead of a hash-seed dependent set order.
list_all_words = list(word_counter)

for i, group in enumerate(tqdm(all_groups, desc='Partitioning vocabulary')):
  current_group = set(group)
  # Union of the words of every other genre. chain.from_iterable also copes
  # with a single-genre frame (nothing left after the delete), a case in which
  # np.concatenate raises "need at least one array to concatenate".
  other_groups = set(chain.from_iterable(np.delete(all_groups, i, axis=0)))

  # Every vocabulary word occurs in at least one genre, so the three tests
  # below are mutually exclusive and together cover the whole vocabulary.
  common = [word for word in list_all_words
            if word in current_group and word in other_groups]
  individual_distinct = [word for word in list_all_words
                         if word in current_group and word not in other_groups]
  others_distinct = [word for word in list_all_words
                     if word not in current_group and word in other_groups]

  individual_distinct_words.append(individual_distinct)
  others_ditinct_words.append(others_distinct)
  common_words.append(common)

# Remove the progress output once the partitioning is finished.
output.clear()

In [None]:
# One bar chart per genre comparing the size of its exclusive vocabulary, the
# vocabulary exclusive to the other genres, and the shared vocabulary.
# Each chart is saved as PNG and handed to the browser for download.
for i, (individual_distinct, others_distinct, common) in enumerate(
    zip(individual_distinct_words, others_ditinct_words, common_words)):
  # The word lists are positional (built from .values), so the genre must be
  # looked up by position too; a label lookup breaks on a non-default index.
  genre = word_level_df['Genre'].iloc[i]
  x = [genre, "Others", "Common"]
  y = [len(individual_distinct), len(others_distinct), len(common)]

  fig, ax = plt.subplots(figsize=(8, 10))
  sns.barplot(x=x, y=y, ax=ax)
  ax.set_ylabel('Word Count')

  chart_file = "common-chart-" + genre + ".png"
  fig.savefig(chart_file, bbox_inches='tight')
  files.download(chart_file)
  plt.show()
  plt.close(fig)  # release the figure even on backends where show() keeps it open

output.clear()

### Top 10 unique words of each tag

In [None]:
# For every genre, count the words that occur in no other genre, then export
# the ten most frequent ones as a bar chart and as a table image.
# all_distinct_counter[i] is the Counter of genre-exclusive words of row i.
all_distinct_counter = []

for i, (current_group, common) in enumerate(zip(all_groups, common_words)):
  # Positional lookup, matching the positional all_groups / common_words lists.
  genre = word_level_df['Genre'].iloc[i]

  # Keep only the words that are NOT shared with another genre. Filtering
  # (instead of Counter.subtract) avoids leaving shared words behind with a
  # count of 0, which most_common() would otherwise return whenever a genre
  # has fewer than ten exclusive words.
  shared = set(common)
  current_counter = Counter({
      word: count
      for word, count in Counter(current_group.tolist()).items()
      if word not in shared
  })
  all_distinct_counter.append(current_counter)

  top_list = current_counter.most_common(10)
  if not top_list:
    # Nothing exclusive to this genre: there is no chart or table to draw.
    print("No genre-exclusive words for " + genre)
    continue

  # plot chart
  x_labels = [word for word, _ in top_list]
  y_labels = [count for _, count in top_list]
  fig, ax = plt.subplots(figsize=(12, 6))
  pd.Series(y_labels).plot(kind='bar', ax=ax)
  ax.set_xticklabels(x_labels)

  # Write each count just above its bar.
  for rect, label in zip(ax.patches, y_labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height, label, ha='center', va='bottom')

  fig.savefig("uncommon-chart-" + genre + ".png", bbox_inches='tight')
  files.download("uncommon-chart-" + genre + ".png")
  plt.show()
  plt.close(fig)

  # draw table on its own figure so it can never land on top of the bar chart
  df = pd.DataFrame(top_list, columns=["Word", "Count"])
  fig = plt.figure()
  ax = fig.add_subplot(111, frame_on=False)  # no visible frame
  ax.xaxis.set_visible(False)  # hide the x axis
  ax.yaxis.set_visible(False)  # hide the y axis

  table(ax, df, loc='center')

  fig.savefig("uncommon-table-" + genre + ".png", bbox_inches='tight')
  files.download("uncommon-table-" + genre + ".png")
  plt.show()
  plt.close(fig)

  output.clear()
output.clear()


### Top 10 common words for each tag (ranked by relative normalized frequency)

In [17]:
# For every genre, score each shared word by how much more frequent it is in
# this genre than in all other genres combined, then export the ten highest
# scoring words as a bar chart and as a table image.
# all_relative_normalized_frequency[i] is the top-10 (word, score) list of row i.
all_relative_normalized_frequency = []

for i, group in enumerate(all_groups):
  # Positional lookup, matching the positional all_groups / common_words lists.
  genre = word_level_df['Genre'].iloc[i]

  current_counter = Counter(group.tolist())
  # chain.from_iterable tolerates a single-genre frame, where nothing is left
  # after the delete and np.concatenate would raise.
  other_counter = Counter(chain.from_iterable(np.delete(all_groups, i, axis=0)))

  # NOTE(review): both normalisers are the number of DISTINCT words, not the
  # total number of tokens. Kept as is because it only rescales the scores of
  # one genre by a constant -- confirm this is the intended normalisation.
  current_words = len(current_counter)
  other_words = len(other_counter)

  scored = [
      (word, (current_counter[word] / current_words) / (other_counter[word] / other_words))
      for word in common_words[i]
  ]
  top_list = sorted(scored, key=lambda tup: tup[1], reverse=True)[:10]

  all_relative_normalized_frequency.append(top_list)
  print(top_list)

  if not top_list:
    # No word is shared with another genre: nothing to chart or tabulate.
    continue

  # plot chart
  x_labels = [word for word, _ in top_list]
  y_labels = [score for _, score in top_list]
  fig, ax = plt.subplots(figsize=(12, 6))
  pd.Series(y_labels).plot(kind='bar', ax=ax)
  ax.set_xticklabels(x_labels)

  # Write each score just above its bar.
  for rect, label in zip(ax.patches, y_labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height, label, ha='center', va='bottom')

  fig.savefig("RelativeNormalizedFrequency-chart-" + genre + ".png", bbox_inches='tight')
  files.download("RelativeNormalizedFrequency-chart-" + genre + ".png")
  plt.show()
  plt.close(fig)

  # draw table on its own figure so it can never land on top of the bar chart
  df = pd.DataFrame(top_list, columns=["Word", "Relative Normalized Frequency"])
  fig = plt.figure()
  ax = fig.add_subplot(111, frame_on=False)  # no visible frame
  ax.xaxis.set_visible(False)  # hide the x axis
  ax.yaxis.set_visible(False)  # hide the y axis

  table(ax, df, loc='center')

  fig.savefig("RelativeNormalizedFrequency-table-" + genre + ".png", dpi=500, bbox_inches='tight')
  files.download("RelativeNormalizedFrequency-table-" + genre + ".png")
  plt.show()
  plt.close(fig)

  output.clear()
output.clear()

### TF-IDF

In [None]:
# Pull the 'Description' column out of the sentence-level frame as a plain list.
all_inputs = sentence_level_df['Description'].tolist()

In [None]:
# Turn every row into one space-separated string: the nested token sequences
# of a row (presumably one per sentence -- TODO confirm) are concatenated and
# then joined. The list is rebuilt from the DataFrame rather than from the
# previous value of all_inputs, so this cell gives the same result no matter
# how often it is re-run (the old form failed on its own output).
all_inputs = [
    " ".join(np.concatenate(sentences).tolist())
    for sentences in sentence_level_df['Description'].values.tolist()
]

In [None]:
# Learn the vocabulary and build the document-term count matrix (one row per
# entry of all_inputs). The list of strings is passed as is: wrapping it in
# np.asarray would create a fixed-width unicode array that pads every
# document to the length of the longest one, which only costs memory.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(all_inputs)

In [None]:
# Fit the IDF weights (smoothed) on the raw term counts of the corpus.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [None]:
# Count the terms of the same documents the vectoriser was fitted on, then
# re-weight those counts with the fitted TF-IDF transformer.
count_vector = cv.transform(all_inputs)
tf_idf_vector = tfidf_transformer.transform(count_vector)

In [None]:
# Export, for every document (row of tf_idf_vector), a table image with its ten
# highest TF-IDF terms. all_tfidf[i] keeps the full, sorted score frame of row i.

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older installations.
if hasattr(cv, 'get_feature_names_out'):
  feature_names = list(cv.get_feature_names_out())
else:
  feature_names = cv.get_feature_names()

all_tfidf = []

for i in range(tf_idf_vector.shape[0]):
  # NOTE(review): the genre label comes from word_level_df while the TF-IDF
  # rows come from sentence_level_df; this assumes both frames list the genres
  # in the same order -- confirm. The lookup is positional to match row i.
  genre = word_level_df['Genre'].iloc[i]

  # Column vector of the scores of document i, one row per vocabulary term.
  document_vector = tf_idf_vector[i]
  scores = pd.DataFrame(document_vector.T.toarray(), index=feature_names, columns=["tfidf"])
  scores = scores.sort_values(by=["tfidf"], ascending=False)

  all_tfidf.append(scores)

  # draw table (top ten terms only) on a dedicated figure
  df = scores.iloc[:10]
  fig = plt.figure()
  ax = fig.add_subplot(111, frame_on=False)  # no visible frame
  ax.xaxis.set_visible(False)  # hide the x axis
  ax.yaxis.set_visible(False)  # hide the y axis

  table(ax, df, loc='center')

  fig.savefig("tfidf-table-" + genre + ".png", bbox_inches='tight')
  files.download("tfidf-table-" + genre + ".png")

  plt.show()
  plt.close(fig)
  print("############")
  print(genre)
  print(all_tfidf[i].iloc[:10])

  output.clear()
output.clear()

### Word frequency histogram

In [None]:
# Export, for every genre, its most frequent words (raw counts) as a bar chart
# and as a table image.
threshold = 20  # number of top words shown per genre

all_groups = word_level_df['Words'].values.tolist()

for i in range(len(all_groups)):
  # Positional lookup, matching the positional all_groups list.
  genre = word_level_df['Genre'].iloc[i]

  word_counter = Counter(all_groups[i])
  top_list = word_counter.most_common(threshold)

  if not top_list:
    # A genre without any words has nothing to chart or tabulate.
    continue

  # plot chart
  x_labels = [word for word, _ in top_list]
  y_labels = [count for _, count in top_list]
  fig, ax = plt.subplots(figsize=(12, 6))
  pd.Series(y_labels).plot(kind='bar', ax=ax)
  ax.set_xticklabels(x_labels)

  # Write each count just above its bar.
  for rect, label in zip(ax.patches, y_labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height, label, ha='center', va='bottom')

  fig.savefig("word-frequency-chart-" + genre + ".png", bbox_inches='tight')
  files.download("word-frequency-chart-" + genre + ".png")
  plt.show()
  plt.close(fig)

  # draw table on its own figure. The values are raw occurrence counts, so the
  # column is labelled "Count" (the former "Relative Normalized mFrequency"
  # header was a copy-paste leftover from the previous section).
  df = pd.DataFrame(top_list, columns=["Word", "Count"])
  fig = plt.figure()
  ax = fig.add_subplot(111, frame_on=False)  # no visible frame
  ax.xaxis.set_visible(False)  # hide the x axis
  ax.yaxis.set_visible(False)  # hide the y axis

  table(ax, df, loc='center')

  fig.savefig("word-frequency-table-" + genre + ".png", dpi=500, bbox_inches='tight')
  files.download("word-frequency-table-" + genre + ".png")
  plt.show()
  plt.close(fig)

  output.clear()
output.clear()