<a href="https://colab.research.google.com/github/nafisenik/NLP-CA2/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [1]:
!pip install ner-d

In [4]:
import os
import pandas as pd
from itertools import groupby
from operator import itemgetter
import re
from collections import Counter
import numpy as np
import tqdm
#from wordcloud import WordCloud
import matplotlib.pyplot as plt
from string import punctuation
import string



from nltk import regexp_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import spacy
from spacy import displacy

from nerd import ner

import datetime

ModuleNotFoundError: No module named 'nerd'

In [None]:
!pip install wordcloud

### Download nltk packages

In [None]:
# Fetch the NLTK resources this notebook relies on: the perceptron POS-tagger model,
# the WordNet data (plus omw-1.4), the stop-word lists and the punkt tokenizer models.
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

### Read text files and save as a list

In [None]:
def read_file(file):
    """Return the entire contents of the text file at ``file``, decoded as UTF-8."""
    with open(file, mode='r', encoding="utf-8") as handle:
        return handle.read()

# set path to text files
file_dir = './Data/Text_Files/'
text_list = []

# find files with .txt and read them
# NOTE(review): os.listdir() returns entries in arbitrary order, while a later cell
# pairs text_list positionally with the rows of metadata.csv -- confirm that the
# file order and the metadata row order actually agree.
for file in os.listdir(file_dir):
    if file.endswith(".txt"):
        # file_dir already ends with '/', so plain concatenation yields a valid path
        file_path = f'{file_dir}{file}'
        text_list.append(read_file(file_path))

In [None]:
# Report how many .txt files were read into text_list.
print(f"number of texts: {len(text_list)}")

### Sample of our data

In [None]:
# Show the first 500 characters of the first loaded text as a sample.
print(f"sample text: {text_list[0][:500]}")

In [None]:
# Load the metadata table (later cells read its 'Title', 'Date' and 'President'
# columns) and preview the first rows.
df = pd.read_csv("./Data/metadata.csv")
df.head()

#### Change Date Type to datetime

In [None]:
# NOTE(review): this prints the dtype of 'Title', but the surrounding section is about
# converting 'Date' -- presumably 'Date' was intended here; confirm.
print(df['Title'].dtypes)

In [None]:
# Parse 'Date' into datetime values. The format '%B %d, %Y' expects a full month name,
# a day of month, a comma, and a four-digit year.
df['Date'] = pd.to_datetime(df['Date'], format='%B %d, %Y')
df['Date']

### Group by presidents:
`dict[president_name]` = `[texts]`


In [None]:
# Build {president name: [texts]} by pairing each metadata row with the text at the
# same position in text_list.
# itertools.groupby only merges *adjacent* equal keys, so using it inside a dict
# comprehension silently dropped earlier texts whenever a president's rows were not
# contiguous. Accumulating into lists keeps every text; keys stay in first-seen order.
# NOTE(review): zip() stops at the shorter input, so a length mismatch between the
# metadata rows and text_list goes unnoticed -- confirm both have the same length.
president_name = df['President'].tolist()
president_text_dict = {}
for name, text in zip(president_name, text_list):
    president_text_dict.setdefault(name, []).append(text)

In [None]:
# List the president names that ended up as keys of the grouping.
president_text_dict.keys()

In [None]:
# Number of texts grouped under 'Barack Obama' (KeyError if that name is not a key).
len(president_text_dict['Barack Obama'])

### Tokenizer

In [None]:
# http://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
# Verbose-mode token pattern. Alternatives are tried in the order written, so a run of
# digits that is not preceded by '$' is consumed by the `\w+` alternative before the
# currency alternative is considered. The currency alternative ends in `\s?`, so a
# token matched by it may include one trailing whitespace character.
pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[a-zA-Z]\.)+(?:[a-zA-Z])?         # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?\s?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

# Compile once at cell execution; tokenize_text passes the compiled object on.
pattern = re.compile(pattern)

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``regexp_tokenize`` and the module-level ``pattern``.

    Characters that match none of the pattern's alternatives are skipped.
    """
    return regexp_tokenize(text, pattern)

### Lemmatizer

In [None]:
def lemmatize_all(sentence):
    """Tokenize ``sentence``, POS-tag the tokens and return their WordNet lemmas as a list.

    The first letter of each tag selects the part of speech handed to the lemmatizer:
    ``N`` -> noun, ``V`` -> verb, ``J`` -> adjective, ``R`` -> adverb. Tokens with any
    other tag are lemmatized as nouns.
    """
    lemmatizer = WordNetLemmatizer()
    pos_by_tag_initial = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    tagged_tokens = pos_tag(tokenize_text(sentence))
    return [
        lemmatizer.lemmatize(token, pos=pos_by_tag_initial.get(tag[:1], 'n'))
        for token, tag in tagged_tokens
    ]

#### Test of lemmatizer

In [None]:
# Run the lemmatizer on a sentence containing plural nouns and inflected verbs/adjectives.
sentence_sample = 'The striped bats are  hanging on their feet for best and wrote you are'
print(lemmatize_all(sentence_sample))



### Normalizing and Cleaning

In [None]:
stop_words = set(stopwords.words('english'))
def normalize_and_cleaning(text, lemm=True, remove_punct=True, lower=True, stop_word=True, remove_number=True, min_len = 2):
    """Clean ``text`` and return it as a single space-separated string.

    The steps run in this fixed order; each one can be switched off by its flag:

    1. always: remove the annotations ``[Laughter]``, ``[Applause]``, ``[applause]``
       and ``[laughter]``
    2. ``lower``: lower-case the text
    3. ``remove_number``: delete number-like runs (optional ``$``, digits, optional
       decimal part, and any word characters glued to them, e.g. ``22nd``)
    4. ``remove_punct``: re-tokenize, drop tokens that are pure punctuation and strip
       punctuation from both ends of the remaining tokens
    5. ``lemm``: lemmatize with ``lemmatize_all``
    6. ``stop_word``: drop English stop words (compared case-insensitively)
    7. ``min_len``: when ``min_len > 1``, keep only tokens whose length is strictly
       greater than ``min_len``

    Finally, runs of spaces are collapsed to a single space.
    """
    # `(?:...)` is a non-capturing group. The previous `(:?...)` was a capturing group
    # starting with an optional literal colon, which also deleted a ':' placed directly
    # before "[Laughter]".
    text = re.sub(r'\[(?:Laughter|Applause|applause|laughter)\]', '', text)

    # (An unused tokenization of the raw text used to happen here; it only cost time.)

    if lower:
        text = text.lower()

    if remove_number:
        text = re.sub(r'\$?\d+\.?\d*\w*\d*', '', text)

    if remove_punct:
        # `word not in string.punctuation` is a substring test: it rejects tokens that
        # appear verbatim inside the punctuation string (single punctuation marks).
        text = ' '.join(
            word.strip(punctuation)
            for word in tokenize_text(text)
            if word not in string.punctuation
        )

    if lemm:
        text = ' '.join(lemmatize_all(text))

    if stop_word:
        text = ' '.join(w for w in tokenize_text(text) if w.lower() not in stop_words)

    if min_len > 1:
        text = ' '.join(word for word in tokenize_text(text) if len(word) > min_len)

    return re.sub(' +', ' ', text)
    
    

In [None]:
#Test normalize_and_cleaning
# The sample mixes bracketed annotations, hyphenated words, ordinals, an abbreviation,
# a currency amount and an ellipsis.
test_text ='''[Laughter]poster-print The President. That 1993s Asia—the asia-pacific 22nd 33nd33 6-year U.S.A. wrote good example. [Applause] Well—[applause]. are [Applause] costs $12.40... Michelle Move! u.'''
tokens = tokenize_text(test_text)
# res uses all defaults; res2 keeps the numbers so the two outputs can be compared.
res = normalize_and_cleaning(test_text)
res2 = normalize_and_cleaning(test_text,remove_number=False)
print(res)

print(res2)



### Join all texts and create our corpus

In [None]:
# Concatenate every loaded text into one string, clean it with the default settings,
# then tokenize the cleaned string for the frequency statistics below.
all_doc = ' '.join(text_list)
clean_corpuse = normalize_and_cleaning(all_doc)
corpuse_tokens = tokenize_text(clean_corpuse)


### Most frequent words of our corpus

In [None]:
# Count token occurrences and keep the 50 most common (word, count) pairs.
counter = Counter(corpuse_tokens)
most_occur = counter.most_common(50)
corpuse_fre_df = pd.DataFrame.from_records(most_occur, columns =['word', 'frequency'])
# Express each count as a percentage of all tokens in the cleaned corpus.
total_freq = [(x[1]/len(corpuse_tokens))*100 for x in most_occur]
corpuse_fre_df['% total frequency'] = total_freq
corpuse_fre_df

### Statistical information

In [None]:
def Statistical_info(text_tokens):
  """Print size and word-length statistics for a list of string tokens.

  Prints the token count, the number of distinct tokens, the mean token length and
  the longest and shortest token (the first one found in case of ties).

  For an empty list only the two counts are printed, because the average and the
  longest/shortest token are undefined; previously that case raised an exception.
  Returns None.
  """
  print (f"Number of words : {len(text_tokens)}")
  print (f"Number of unique  words : {len(set(text_tokens))}")
  if not text_tokens:
    # Nothing to average and no longest/shortest token to report.
    return
  word_len_avg = sum(map(len, text_tokens))/float(len(text_tokens))
  print (f"Average word length : {word_len_avg}")
  max_w = max(text_tokens, key=len)
  min_w = min(text_tokens, key=len)
  print (f"Longest word : {max_w}")
  print (f"Shortest word : {min_w}")

### Word cloud of the corpus

In [None]:
# The WordCloud import in the top import cell is commented out, so this cell raised
# NameError. Import it here, after the cell that installs the package.
from wordcloud import WordCloud

# Build an 800x800 word cloud on a white background from the cleaned corpus string.
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
).generate(clean_corpuse)

# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

### Statistical information of the corpus

In [None]:
  # Print a banner, then the statistics for the tokens of the whole cleaned corpus.
  print('*' *50)
  print('Statical information of corpus')
  print('*' *50)
  Statistical_info(corpuse_tokens)

### Statistical information for each president

In [None]:
# Print the statistics for each president's cleaned, concatenated texts.
# The loop target is deliberately not named `text_list`: that name holds the list of
# all loaded texts, and rebinding it here left it pointing at the last president's
# texts, breaking any later re-run of the cells that read it.
for president, president_text_list in tqdm.tqdm_notebook(president_text_dict.items()):
  president_texts = (' '.join(president_text_list))
  clean_text = normalize_and_cleaning(president_texts)
  president_tokens = tokenize_text(clean_text)
  print('*' *50)
  print(president)
  print('*' *50)
  Statistical_info(president_tokens)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build one cleaned document per president, in the key order of president_text_dict.
# The loop target is deliberately not named `text_list`, so the notebook-wide list of
# all loaded texts is not overwritten by the last president's texts.
all_p_text =[]
for president, president_text_list in tqdm.tqdm_notebook(president_text_dict.items()):
  president_texts = (' '.join(president_text_list))
  clean_text = normalize_and_cleaning(president_texts)
  all_p_text.append(clean_text)



### Clustering

In [None]:
# Vectorize the per-president documents with TF-IDF, using the notebook's own tokenizer.
vectorizer = TfidfVectorizer(tokenizer=tokenize_text)
X = vectorizer.fit_transform(all_p_text)
# get_feature_names() has been removed from scikit-learn; get_feature_names_out() is
# its replacement and was already called in this cell (its result was discarded).
feature_names = vectorizer.get_feature_names_out()
print(X.shape)
# One row per president, one column per vocabulary term.
count_vect_df = pd.DataFrame(X.toarray(), columns=feature_names)
count_vect_df
#X_df = pd.DataFrame(X)


In [None]:
from sklearn.cluster import KMeans
# Cluster the TF-IDF rows into 3 groups; random_state is fixed for repeatable labels.
kmeans = KMeans(n_clusters=3, random_state=0)
y = kmeans.fit_predict(X)
# Pair each president with its cluster label. This relies on president_text_dict
# yielding its keys in the same order that was used to build all_p_text.
p_names = list(president_text_dict.keys())
vect_df = pd.DataFrame(p_names, columns = ['President'])
vect_df['Label'] = y

In [None]:
# Display the president -> cluster label table.
vect_df

In [None]:
# Place the President/Label columns next to the TF-IDF columns (column-wise concat).
final_df = pd.concat([vect_df, count_vect_df], axis=1)
# Keep the first 12 rows for display.
f = final_df.iloc[0:12,:]
f

In [None]:
# Settings for the projection. Of these, only pca_num_components is read in this cell;
# the others are not referenced by any cell visible in this notebook.
num_clusters = 3
num_seeds = 3
max_iterations = 300
labels_color_map = {
    0: '#20b2aa', 1: '#ff7373', 2: '#ffe4e1'
}
pca_num_components = 2
tsne_num_components = 2

# toarray() returns a plain ndarray. todense() returns an np.matrix, which current
# scikit-learn versions refuse as estimator input.
X1 = X.toarray()
# Project the TF-IDF rows onto their first two principal components for plotting.
reduced_data = PCA(n_components=pca_num_components).fit_transform(X1)
dense_df = pd.DataFrame(reduced_data, columns=['X','Y'])
dense_df


In [None]:
# Display the 2-D PCA coordinates (same frame the previous cell already showed).
dense_df

In [None]:
# Attach the 2-D PCA coordinates to the President/Label table (column-wise concat).
final_dense_df = pd.concat([vect_df, dense_df], axis=1)
final_dense_df

In [None]:
# Scatter plot of the PCA coordinates: one marker series per cluster label, each point
# annotated with the president's name.
groups = final_dense_df.groupby('Label')
cluster_names_0 = {0: 'Cluster 0', 1: 'Cluster 1', 2: 'Cluster 2'}

# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# Axis styling does not depend on the group, so it is applied once instead of being
# repeated on every loop iteration: hide ticks and tick labels on both axes.
ax.set_aspect('auto')
ax.tick_params(
    axis='x',
    which='both',       # both major and minor ticks are affected
    bottom=False,
    top=False,
    labelbottom=False)
ax.tick_params(
    axis='y',
    which='both',
    left=False,
    top=False,
    labelleft=False)

# iterate through groups to layer the plot
for name, group in groups:
    ax.plot(group.X, group.Y, marker='o', linestyle='', ms=12,
            label=cluster_names_0[name],
            mec='none')

ax.legend(numpoints=1, loc='upper left')  # show legend with only 1 point

# Label every point with the president's name. Iterating over the rows avoids the
# label-based `.loc[i]` lookup, which only works when the index is exactly 0..n-1.
for row in final_dense_df.itertuples(index=False):
    ax.text(row.X, row.Y, row.President, size=13)

plt.show()

### Named entity recognition (NER)

In [None]:
def specify_NER(raw_text):
  """Return ``raw_text`` with recognized entities replaced by their entity labels.

  Entities come from ``ner.name`` with the 'en_core_web_sm' model; the result is also
  printed. Every occurrence of an entity's text is replaced by its label, except for
  entities labelled 'GPE', which are left as they are.
  """
  # presumably `ner.name` yields spaCy entity spans; only `.text` and `.label_` are used
  doc = ner.name(raw_text, language='en_core_web_sm')
  print(doc)
  labelled_entities = [(entity.text, entity.label_) for entity in doc]
  for entity_text, entity_label in labelled_entities:
    if entity_label == 'GPE':
      continue
    raw_text = raw_text.replace(entity_text, entity_label)
  return raw_text

In [None]:
# Try specify_NER on a sentence containing abbreviations, a price, a year and person names.
input_text = "That U.S.A. poster-print e.g the U.S costs $12.40 that created in 2008 by Jeff Atwood and Joel Spolsky."
specify_NER(input_text)

In [None]:
# Load the spaCy pipeline 'en_core_web_sm' and render the entities it finds in a
# sample sentence inline in the notebook.
NER = spacy.load("en_core_web_sm")
raw_text = 'That U.S.A. poster-print e.g the U.S costs $12.40'
text1= NER(raw_text)
displacy.render(text1,style="ent",jupyter=True)



In [None]:

s = "That U.S.A. and Nicolas poster-print e.g the U.S costs $12.40"
doc = NER(s)
# word_tokens is not used again within this cell.
word_tokens = tokenize_text(s)

# Replace each token that is part of an entity with its entity type; keep other tokens.
a = " ".join([t.text if not t.ent_type_ else t.ent_type_ for t in doc])
print(tokenize_text(a))


In [None]:
# Scratch inputs for comparing the two punctuation-removal variants defined below:
# `l` holds the tokens of the sample sentence, `t` the sentence itself.
l = tokenize_text('That U.S.A. poster-print e.g the U.S costs $12.40 amir. hello')
t = 'That U.S.A. poster-print e.g the U.S costs $12.40 amir. hello'
def punct(text):
    """Drop one-character tokens from ``text``, then strip punctuation from token edges.

    The text is tokenized, tokens of length 1 are discarded, the rest is re-joined and
    tokenized again, and each resulting token has leading/trailing punctuation removed.
    Returns the tokens joined by single spaces.
    """
    long_tokens = [token for token in tokenize_text(text) if len(token) > 1]
    rejoined = ' '.join(long_tokens)
    stripped_tokens = (token.strip(punctuation) for token in tokenize_text(rejoined))
    return ' '.join(stripped_tokens)
        
#punct(t) 
import re, string

def test(text):
    """Return the tokens of ``text`` joined by spaces, without pure-punctuation tokens.

    A token is dropped when it occurs as a substring of ``string.punctuation``.
    """
    kept_tokens = []
    for token in tokenize_text(text):
        if token in string.punctuation:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
# Print both variants on the same sentence to compare their output.
print(test(t))
print(punct(t))
#print(l)

In [None]:
# Sample string mixing ordinals, prices, a hyphenated number and plain numbers.
l = 'That U.S.A. poster-print e.g the U.S costs $12.40 3th amir. hello 7. 9 2th 2nd2 5-year $56 $4 1890'

def remove_num(t):
    """Delete number-like runs from ``t`` and collapse repeated spaces.

    A run is an optional ``$``, one or more digits, an optional decimal part and any
    word characters attached directly after it (so ``$12.40``, ``3th`` and ``2nd2`` are
    removed entirely, while ``5-year`` becomes ``-year``).
    """
    number_like = re.compile(r'\$?\d+\.?\d*\w*\d*')
    without_numbers = number_like.sub('', t)
    return re.compile(' +').sub(' ', without_numbers)

# NOTE: this rebinds `y`, which an earlier cell used for the KMeans cluster labels.
y = remove_num(l)
print(test(y))


In [None]:
# Show the English stop-word list that normalize_and_cleaning filters against.
print(stopwords.words('english'))


In [None]:
from nltk.tokenize import word_tokenize
# Scratch check of the length filter: keep only tokens strictly longer than min_len.
min_len = 2
l = 'That U.S.A. poster-print e.g the U.S cs $12.40 3th am'
t = word_tokenize(l)
# `text` is only assigned when min_len > 1.
if min_len>1:
    text = ' '.join([word for word in t if len(word)>min_len]) 

In [None]:
# Display the length-filtered string built in the previous cell.
text

In [None]:
# Scratch check that the bracketed audience annotations are removed from a sample.
text = '[Laughter]poster-print The President. That 1993s [Applause] Asia—the asia-pacific'
# `(?:...)` is a non-capturing group. The previous `(:?...)` instead began with an
# optional literal colon, so a ':' directly before "[Laughter]" was deleted as well.
tt = re.sub(r'(?:\[Laughter\]|\[Applause\]|\[applause\]|\[laughter\])', '', text)
tt