In [None]:
## Preprocessing Methods

In [None]:
#Shreyasi Combined

import numpy as np
import pandas as pd

'''
    The order of pre-processing is the following:
    Basic (Lower case, digit removal etc)
    short-form replacer
    stopwords removal
    Lemmatization
    print ngrams 
    and use it to tokenise finally

'''

# ser = data['project description']

def basic(ser):
    """Apply basic text normalisation to a Series of strings.

    Steps, in order:
      1. replace every character that is not an ASCII letter, a digit,
         ``-`` or ``#`` with a single space;
      2. delete every run of digits;
      3. lower-case the text.

    Parameters
    ----------
    ser : pandas.Series
        Raw text, one document per element.

    Returns
    -------
    pandas.Series
        A new Series with the same index; the input is left untouched.
    """
    # regex=True is spelled out because pandas 2.x treats the pattern as a
    # literal string by default, which would silently turn both calls into no-ops.
    ser = ser.str.replace("[^a-zA-Z0-9-#]", " ", regex=True)
    ser = ser.str.replace(r"\d+", "", regex=True)

    # Vectorised lower-casing; unlike a per-row lambda it passes missing
    # values through instead of raising AttributeError on them.
    return ser.str.lower()


def replacer(ser):
    """Run every document through ``RegexpReplacer.replace``.

    This is the "short-form replacer" step of the preprocessing pipeline.
    NOTE(review): the substitutions themselves live in the project-local
    ``replacers`` module, which is not visible here.

    Parameters
    ----------
    ser : pandas.Series
        One text document per element.

    Returns
    -------
    pandas.Series
        A new Series with the same index holding the replaced text. The input
        Series is not modified.
    """
    # Imported lazily so the notebook cell can be defined even when the
    # helper module is not on the path yet.
    from replacers import RegexpReplacer

    regexp_replacer = RegexpReplacer()
    # ``apply`` visits the values regardless of index labels. The previous
    # ``ser[i]`` loop over ``range(len(ser))`` did a label lookup and therefore
    # raised KeyError on any Series whose index is not exactly 0..n-1.
    return ser.apply(regexp_replacer.replace)


def stopwords(ser, use_domain_stopwords=False):
    """Split each document on whitespace and drop stop words.

    Parameters
    ----------
    ser : pandas.Series
        One text document (``str``) per element.
    use_domain_stopwords : bool, default False
        When True, the project-specific words listed below are removed in
        addition to NLTK's English stop words. The default keeps the previous
        effective behaviour, in which that list was reset to ``[]`` before
        use and so never applied.

    Returns
    -------
    pandas.Series
        Same index as ``ser``; each element is the list of surviving tokens.
    """
    # Aliased so the corpus reader does not reuse this function's own name.
    from nltk.corpus import stopwords as nltk_stopwords

    domain_stopwords = {
        'using', 'used', 'develop', 'developed', 'study', 'studied', 'given',
        'review', 'value', 'done', 'performed', 'implement', 'implementation',
        'application', 'guide', 'prof', 'predict', 'technology', 'jupyter',
        'notebook', 'achieved', 'different', 'technique', 'create', 'created',
        'python', 'implemented', 'worked', 'code', 'google', 'colab',
        'trained', 'technologies', 'proposed', 'build', 'built',
    }

    # A set gives constant-time membership tests; the former list made every
    # token lookup scan the whole stop-word list.
    stop_word_set = set(nltk_stopwords.words('english'))
    if use_domain_stopwords:
        stop_word_set |= domain_stopwords

    # Plain whitespace split: the MWETokenizer used before was built without
    # any multi-word expressions, so it returned the split tokens unchanged.
    return ser.apply(
        lambda text: [token for token in text.split() if token not in stop_word_set]
    )


def lemmatizer(tokenized_doc):
    """Lemmatise every token of every document, in place.

    Each token is converted with ``str`` and passed to
    ``WordNetLemmatizer.lemmatize``. The inner token lists are overwritten
    element by element, and the same ``tokenized_doc`` object is returned.

    Parameters
    ----------
    tokenized_doc : iterable of list
        One mutable list of tokens per document.

    Returns
    -------
    The ``tokenized_doc`` object that was passed in.
    """
    from nltk.stem import WordNetLemmatizer

    wordnet = WordNetLemmatizer()
    for tokens in tokenized_doc:
        # Slice assignment rewrites the existing list object, so callers that
        # hold a reference to it see the lemmatised tokens.
        tokens[:] = [wordnet.lemmatize(str(token)) for token in tokens]

    return tokenized_doc


def print_ngrams(tokenized_doc, stop_words=None):
    """Return the most strongly associated bigrams and trigrams of the corpus.

    All documents are flattened into one token stream. Collocations are then
    ranked by likelihood ratio and the 20 best bigrams plus the 20 best
    trigrams are returned. Despite its name, the function prints nothing.

    Parameters
    ----------
    tokenized_doc : iterable of iterable of str
        One token sequence per document.
    stop_words : iterable of str, optional
        Words that disqualify any n-gram containing them. Defaults to NLTK's
        English stop-word list.

    Returns
    -------
    list of tuple
        Bigram tuples followed by trigram tuples.
    """
    import itertools

    from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    # The filter used to read ``new_stopwords_list``, a variable local to
    # ``stopwords()``, so calling this function raised NameError. The stop
    # words are now resolved inside this function.
    if stop_words is None:
        from nltk.corpus import stopwords as nltk_stopwords
        stop_words = nltk_stopwords.words('english')
    stop_word_set = set(stop_words)

    def is_stop_word(word):
        return word in stop_word_set

    words = list(itertools.chain.from_iterable(tokenized_doc))

    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(is_stop_word)

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(is_stop_word)
    # Only trigrams get a minimum-frequency cut-off; bigrams are unrestricted.
    tcf.apply_freq_filter(3)

    bigrams = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
    trigrams = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 20)
    return bigrams + trigrams


def tokenize_ngram(ngrams, tokenized_doc):
    """Merge known n-grams into single tokens and return one string per document.

    Parameters
    ----------
    ngrams : iterable of tuple of str
        Multi-word expressions to merge, e.g. the output of ``print_ngrams``.
    tokenized_doc : iterable of iterable of str
        One token sequence per document.

    Returns
    -------
    list of str
        For each document, its tokens joined by single spaces, with every
        occurrence of an n-gram collapsed into one token by ``MWETokenizer``
        (NLTK joins the parts with ``_`` by default).
    """
    from nltk.tokenize import MWETokenizer

    tokenizer = MWETokenizer(ngrams)

    # The previous version looped over the still-empty output list, so nothing
    # was ever detokenised, and then called ``.apply`` on that list, which
    # raised AttributeError. Iterating over the documents directly also avoids
    # any dependence on the index labels of a pandas Series.
    return [
        ' '.join(tokenizer.tokenize(list(tokens)))
        for tokens in tokenized_doc
    ]

In [None]:
# Smayan-1

# A Doc object’s doc.noun_chunks property allows us to iterate over the noun chunks in the document. 
# A noun chunk is a phrase that has a noun as its head. 

# Example for the phrase " A noun chunk is a good phrase that has a noun as its head"

# 1. A noun chunk
# 2. a good phrase
# 3. a noun
# 4. its head

# Best lead so far.
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_lg")

# Load the project descriptions; the file's columns are renamed to a single
# "project" column.
df = pd.read_csv("data_final.csv")
df.columns = ["project"]
rawlist = list(df["project"])

items = len(rawlist)

cleaned_documents = []

# For every project, keep only the text of its noun chunks. Each chunk is
# prefixed with one space, so a non-empty phrase starts with a space.
for item, text in enumerate(rawlist):
    print(f"Project {item}")
    doc = nlp(text)
    chunk_texts = []
    for chunk in doc.noun_chunks:
        chunk_texts.append(str(chunk))
        print(chunk)
    phrase = "".join(" " + chunk_text for chunk_text in chunk_texts)
    cleaned_documents.append(phrase)
    print("\n")

# NOTE(review): mode='a' appends, so every re-run of this cell adds the rows
# to data_final_spacy.csv again. Confirm that this is intended.
df = pd.DataFrame(cleaned_documents)
df.to_csv("data_final_spacy.csv", mode = 'a', header = False)

In [None]:
# Smayan-2

## Basic Analysis

# Data Loading
# `nltk` itself has to be imported before `nltk.download` can be called;
# `from nltk.corpus import stopwords` alone does not bind the name `nltk`.
import nltk
import pandas as pd
# NOTE(review): this rebinds the global name `stopwords`, which shadows the
# `stopwords()` preprocessing function if that cell was run earlier.
from nltk.corpus import stopwords

df = pd.read_csv("/content/data_final.csv")
df.head()
len(df)

labels = df["label"].tolist()
labels[:10]

nltk.download('stopwords')

documents = df['project'].tolist()

stoplist = stopwords.words('english')
# Set copy for constant-time lookups; `stoplist` stays available as a list.
stopword_set = set(stoplist)

cleaned_documents = []

for document in documents:
    text = document
    print(text)
    # Compare in lower case: the corpus is lower-cased further down, so a
    # capitalised stop word such as "The" must be removed here as well.
    clean_word_list = [word for word in text.split() if word.lower() not in stopword_set]
    print(clean_word_list)
    cleaned_text = ' '.join(clean_word_list)
    cleaned_documents.append(cleaned_text)

# From here on `documents` holds the stop-word-free texts, and `corpus` is all
# of them joined into one lower-cased string.
documents = cleaned_documents
corpus = " ".join(documents).lower()
print(documents)

# Data Cleaning- Removing punctuation and digits

# NOTE(review): `documents` is printed in full a second time here; nothing has
# changed it since the print that follows the stop-word removal.
print(documents)
import string

def clean_text(corpus):
    """Return ``corpus`` with every ASCII punctuation and digit character deleted.

    Parameters
    ----------
    corpus : str
        Text to clean.

    Returns
    -------
    str
        The text without any character from ``string.punctuation`` or
        ``string.digits``; all other characters, whitespace included, are kept.
    """
    # One translation table that deletes both character classes in one pass.
    unwanted_chars = string.punctuation + string.digits
    return corpus.translate(str.maketrans('', '', unwanted_chars))
# Replace `corpus` with its cleaned form (punctuation and digits removed).
corpus = clean_text(corpus=corpus)
# Preview of the first 1000 characters; a bare expression is only rendered
# when it is the last statement of a notebook cell.
corpus[:1000]


# Data Analysis using Spacy:

import spacy
nlp = spacy.load('en_core_web_lg')

# How many times larger the corpus is than the longest text the spaCy
# pipeline accepts (nlp.max_length).
len(corpus) / nlp.max_length

# Cut the corpus into consecutive slices of at most nlp.max_length characters.
# NOTE(review): the cuts fall at fixed character offsets, so a word that
# straddles a boundary ends up split across two slices.
corpus_chunk = []
for start in range(0, len(corpus), nlp.max_length):
    corpus_chunk.append(corpus[start:start + nlp.max_length])

# Run the spaCy pipeline on every slice.
docs = [nlp(chunk) for chunk in corpus_chunk]
    
# Named Entity Recognition

words, nouns, verbs = [], [], []
people, orgs = [], []

for doc in docs:
    # Part-of-speech pass: record every token, plus verbs and nouns separately.
    for token in doc:
        words.append(token.text)
        pos = token.pos_
        if pos == "VERB":
            verbs.append(token.text)
        elif pos == "NOUN":
            nouns.append(token.text)

    # Named-entity pass: record organisations and people.
    for ent in doc.ents:
        label = ent.label_
        if label == "ORG":
            orgs.append(ent.text)
        elif label == "PERSON":
            people.append(ent.text)

print("Total words: ", len(words))
print("Total verbs: ", len(verbs))
print("Total nouns: ", len(nouns))
print("Total people: ", len(people))
print("Total organizations: ", len(orgs))

from collections import Counter
from collections import OrderedDict

def sort_by_frequency(data,reverse = True):
    """Count the items of ``data`` and order them by number of occurrences.

    Parameters
    ----------
    data : iterable of hashable
        Items to count.
    reverse : bool, default True
        True sorts from most to least frequent, False the other way round.

    Returns
    -------
    collections.OrderedDict
        Maps each distinct item to its count. Items with equal counts keep
        the order in which they first appear in ``data``.
    """
    counts = Counter(data)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=reverse)
    return OrderedDict(ranked)

# Frequency tables, most frequent first, for each list collected from spaCy.
words_frequency = sort_by_frequency(words)
nouns_frequency = sort_by_frequency(nouns)
verbs_frequency  = sort_by_frequency(verbs)
people_frequency = sort_by_frequency(people)
orgs_frequency = sort_by_frequency(orgs)

# Bare expression: rendered only when it is the last statement of a cell.
words_frequency


## N-Gram Analysis
import nltk

def create_bigram(tokens):
    """Return all adjacent token pairs of ``tokens`` as a list of 2-tuples.

    ``nltk.bigrams`` produces the pairs lazily, so they are collected into a
    list here to allow the result to be iterated more than once.
    """
    return [pair for pair in nltk.bigrams(tokens)]

# `bigrams_list` keeps the (first, second) tuples; `bigrams` holds the same
# pairs as "first second" strings.
# NOTE(review): `words` comes from the whole corpus joined into one string, so
# some pairs span the boundary between two projects.
bigrams_list = create_bigram(words)
bigrams = [" ".join(bigram) for bigram in list(bigrams_list)]

bigrams[:10]

# Sorting bigram by frequency
bigram_frequency = sort_by_frequency(bigrams)

def get_top_n_from_order_dict(ordered_dict,n):
    """Return the first ``n`` ``(key, value)`` pairs of ``ordered_dict``.

    Parameters
    ----------
    ordered_dict : mapping
        Mapping whose iteration order defines "top", e.g. the result of
        ``sort_by_frequency``.
    n : int
        Maximum number of pairs to return; values below 1 give an empty list.

    Returns
    -------
    list of tuple
        At most ``n`` pairs. When the mapping holds fewer than ``n`` entries,
        all of them are returned instead of raising IndexError.
    """
    # The item list is built once and sliced; it used to be rebuilt for every
    # one of the n positions.
    return list(ordered_dict.items())[:max(n, 0)]

# Finding top 25 bigram
# The 25 most frequent bigrams as ("first second", count) pairs.
top_bigram = get_top_n_from_order_dict(bigram_frequency,25)
top_bigram

## Visualizing the top bigrams

import numpy as np
import matplotlib.pyplot as plt

def visualize_list_of_tuple(data):
    """Draw a horizontal bar chart from ``(label, value)`` pairs.

    Parameters
    ----------
    data : iterable of (label, value)
        One bar per pair, e.g. the output of ``get_top_n_from_order_dict``.
        The first pair gets y position 0.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``. Nothing is drawn when
        ``data`` is empty.
    """
    pairs = list(data)
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        # ValueError on empty input.
        return

    list1, list2 = zip(*pairs)
    y_pos = np.arange(len(list1))
    plt.barh(y_pos, list2, align='center', alpha=0.5)
    plt.yticks(y_pos, list1)
    plt.show()
    
##Finding unique nouns,verbs, people, name and organization

# Sets for fast membership tests in the bigram filters defined below.
# NOTE(review): the people and organisation sets hold whole entity texts
# (ent.text), not single tokens.
unique_nouns = set(nouns)
unique_verbs = set(verbs)
unique_people = set(people)
unique_orgs = set(orgs)

# Both words Nouns
def get_noun_noun_bigram(bigrams):
    """Return, as "first second" strings, the bigrams made of two nouns.

    A word counts as a noun when it is in the module-level set
    ``unique_nouns``, i.e. when it was tagged NOUN at least once anywhere in
    the corpus. The tag of the particular occurrence is not checked.

    Parameters
    ----------
    bigrams : iterable of (str, str)
        Word pairs to filter.
    """
    return [
        " ".join((first_word, second_word))
        for first_word, second_word in bigrams
        if first_word in unique_nouns and second_word in unique_nouns
    ]
# Noun-noun bigrams as strings, one entry per occurrence (duplicates kept).
NN_bigrams = get_noun_noun_bigram(bigrams_list)
NN_bigrams

# First word verb
def get_bigram_starting_with_verb(bigrams):
    """Return, as "first second" strings, the bigrams whose first word is a verb.

    A word counts as a verb when it is in the module-level set
    ``unique_verbs``.

    Parameters
    ----------
    bigrams : iterable of (str, str)
        Word pairs to filter.
    """
    return [
        " ".join((first_word, second_word))
        for first_word, second_word in bigrams
        if first_word in unique_verbs
    ]
# Verb-initial bigrams as strings, one entry per occurrence (duplicates kept).
V_bigrams = get_bigram_starting_with_verb(bigrams_list)
V_bigrams

# First word organization 
def get_bigram_starting_with_organization(bigrams):
    """Return, as "first second" strings, the bigrams whose first word is an organisation.

    The first word is looked up in the module-level set ``unique_orgs``.
    NOTE(review): that set stores whole entity texts, so presumably only
    single-word organisation names can match a single token. Confirm.

    Parameters
    ----------
    bigrams : iterable of (str, str)
        Word pairs to filter.
    """
    return [
        " ".join((first_word, second_word))
        for first_word, second_word in bigrams
        if first_word in unique_orgs
    ]
# Organisation-initial bigrams as strings, one entry per occurrence.
org_bigrams = get_bigram_starting_with_organization(bigrams_list)
org_bigrams

# First word person
def get_bigram_starting_with_person(bigrams):
    """Return, as "first second" strings, the bigrams whose first word is a person.

    The first word is looked up in the module-level set ``unique_people``.
    NOTE(review): that set stores whole entity texts, so presumably only
    single-word person names can match a single token. Confirm.

    Parameters
    ----------
    bigrams : iterable of (str, str)
        Word pairs to filter.
    """
    return [
        " ".join((first_word, second_word))
        for first_word, second_word in bigrams
        if first_word in unique_people
    ]
# Person-initial bigrams as strings, one entry per occurrence.
people_bigrams = get_bigram_starting_with_person(bigrams_list)
people_bigrams

# Sorting the bigrams by frequency.

# Frequency tables, most frequent first, for each family of bigrams.
NN_bigrams_frequency = sort_by_frequency(NN_bigrams)
V_bigrams_frequency  = sort_by_frequency(V_bigrams)
people_bigrams_frequency = sort_by_frequency(people_bigrams)
org_bigrams_frequency = sort_by_frequency(org_bigrams)

# Finding top 25 bigrams with both nouns
top_25_NN_bigram = get_top_n_from_order_dict(NN_bigrams_frequency,25)
visualize_list_of_tuple(top_25_NN_bigram)