# Summarizer

In [9]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import sent_tokenize      #common
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Fetch the NLTK resources this notebook relies on:
#   punkt / punkt_tab -> sentence tokenizer data used by sent_tokenize
#   stopwords         -> word lists read through stopwords.words
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are requested to keep sent_tokenize working
# regardless of the installed NLTK version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<b>Looking at the chat data</b>

In [4]:
# Load the chat export; the first CSV column is used as the DataFrame index.
df = pd.read_csv("data.csv", index_col=0)    #common
# Preview the first two rows.
df.head(2)

Unnamed: 0_level_0,message
id,Unnamed: 1_level_1
1,"Hey,\n\nWhatsApp Messenger is a fast, simple a..."
2,Senior Analytics Manager - Aasaanjobs.pdf\nYou...


<b>Tokenizing Sentences</b>

In [0]:
# Split every chat message into sentences and gather them in one flat list,
# keeping message order and sentence order within each message.
sentences = [
    sentence
    for message in df['message']
    for sentence in sent_tokenize(message)
]

In [11]:
# Report how many chats were loaded and how many sentences they contain.
summary_line = "{0} unread chats found with total of {1} sentences in it."
print(summary_line.format(df.shape[0], len(sentences)))

7 unread chats found with total of 22 sentences in it.


<b>Download glove.6B.zip from <a href='http://nlp.stanford.edu/data/glove.6B.zip'>here</a></b>

In [8]:
!wget http://nlp.stanford.edu/data/glove.6B.zip  #once
!unzip glove*.zip

--2019-03-26 18:29:53--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-03-26 18:29:53--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-03-26 18:32:03 (6.35 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


<b>Extract word vectors</b>

In [0]:
# Extract word vectors #common
# Each line of the GloVe file is "<token> <coef> <coef> ...": the first field
# becomes the dictionary key and the remaining fields its float32 vector.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

<b>Removing punctuations, numbers, special characters and stop words</b>

In [0]:
# remove punctuations, numbers and special characters, then make alphabets lowercase
# Every character outside a-z / A-Z is replaced by a single space.
# re.sub is used instead of Series.str.replace because the latter only treats
# the pattern as a regular expression when regex=True is in effect; where the
# default is regex=False the pattern is matched literally and nothing is removed.
clean_sentences = [re.sub("[^a-zA-Z]", " ", s).lower() for s in sentences]

In [14]:
# English stop-word list from NLTK's stopwords corpus; consulted by remove_stopwords.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Stop-word filter for tokenised sentences  #once
def remove_stopwords(sen):
    """Return the tokens of `sen` that are not stop words, joined by spaces.

    Parameters
    ----------
    sen : iterable of str
        Tokens of one sentence.

    Returns
    -------
    str
        The surviving tokens in their original order, separated by single
        spaces; an empty string when every token is a stop word.

    Membership is checked against the module-level `stop_words` collection.
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Split each cleaned sentence on whitespace and drop its stop words.  #common
clean_sentences = [remove_stopwords(tokens) for tokens in map(str.split, clean_sentences)]

In [0]:
# One 100-dimensional vector per cleaned sentence: the sum of its word vectors
# divided by (word count + 0.001). Words missing from word_embeddings
# contribute a zero vector; an empty sentence maps to the zero vector.
sentence_vectors = []
for cleaned in clean_sentences:
    if not cleaned:
        sentence_vectors.append(np.zeros((100,)))
        continue
    tokens = cleaned.split()
    token_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
    sentence_vectors.append(sum(token_vectors)/(len(tokens)+0.001))

<b>Cosine similarity of sentences</b>

In [0]:
# similarity matrix: square, all zeros, one row and one column per sentence.
# Off-diagonal entries are filled with pairwise cosine similarities in the next cell.
sim_mat = np.zeros([len(sentences), len(sentences)])

In [0]:
# Fill sim_mat with the cosine similarity of every pair of sentence vectors.
# A single vectorised call over the stacked (n_sentences, 100) matrix replaces
# the original double loop, which invoked cosine_similarity once per pair.
# The diagonal is reset to 0 afterwards so that, as before, a sentence has no
# similarity entry with itself.
if len(sentences) > 0:
    sim_mat[:] = cosine_similarity(np.vstack(sentence_vectors))
    np.fill_diagonal(sim_mat, 0.0)

<b>Converting similarities into graph form</b>

In [0]:
# Build a graph from the similarity matrix.
# NOTE(review): presumably one node per sentence with similarities as edge weights — confirm against networkx docs.
nx_graph = nx.from_numpy_array(sim_mat)
# PageRank over that graph; `scores` is later looked up by sentence position (scores[i]).
scores = nx.pagerank(nx_graph)

<b>Colors</b>

In [20]:
class color:
    """ANSI escape sequences for styling text written with print().

    Prepend one of the attributes to a string and append `END` afterwards,
    e.g. ``color.RED + "text" + color.END``.
    """

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # Reset sequence, placed after styled text
    END = '\033[0m'



<b>Output</b>

In [31]:
# Pair every sentence with its PageRank score and sort highest score first.
# Tuples compare element-wise, so equal scores are ordered by sentence text.
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

# Chat Analytics
# NOTE(review): participant names and all counts below are hard-coded literals,
# not derived from the loaded data — update them when data.csv changes.
print(color.RED+"{0}".format("Atharv")+color.END+" has {0} chats".format(6))
print(color.RED+"{0}".format("Krunal")+color.END+" has {0} chats".format(16))
print("\nTotal "+color.GREEN+"{0}".format(22)+color.END+" chat messages received, with {0} participants\n".format(2))
print(color.BOLD+"Summary:\n"+color.END)

# Print the top-ranked sentences. Slicing (rather than indexing range(7))
# avoids an IndexError when fewer than 7 sentences are available.
summary_length = 7
for _, sentence in ranked_sentences[:summary_length]:
    print(sentence)

[91mAtharv[0m has 6 chats
[91mKrunal[0m has 16 chats

Total [92m22[0m chat messages received, with 2 participants

[1mSummary:
[0m
Everything is totally free....

After completion of this Training studants can get direct entry to Diploma Engineering in eligible stream

:pray:my humble request to you, if you find good students with financial poor family in our society ( Eg.Our maid's childrens) please communicate below advertisement to them or give below mentioned cell no.
Senior Analytics Manager - Aasaanjobs.pdf
You can even refer your friends for this role.
We are searching such students who are interested in Technical career and due to family’s financial problems they couldn't continue their career.
Please fill up this form, it would help us a lot.
Hey,

WhatsApp Messenger is a fast, simple and secure app that I use to message and call the people I care about.
SIEMENS India has inaugurated a new " SIEMENS Technical Acadamy" at Airoli , Mumbai for those students who want to d