<a href="https://colab.research.google.com/github/mkane968/Text-Mining-with-Student-Papers/blob/main/notebooks/Preprocessing_and_Basic_Text_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Packages

In [None]:
# Import the Natural Language Toolkit (NLTK); uncomment the pip line below if it is not installed
#!pip install nltk
import nltk

# Download the 'punkt' resource and import the sentence and word tokenizers
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the 'stopwords' corpus and import the helpers used to clean text
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Import the stemmers and the WordNet lemmatizer, and download the 'wordnet' and 'omw-1.4' resources
from nltk.stem.snowball import SnowballStemmer # This is "Porter 2" and is considered the optimal stemmer.
from nltk.stem import (PorterStemmer, LancasterStemmer)
nltk.download('wordnet')
from nltk import WordNetLemmatizer
nltk.download('omw-1.4')

# Import the NLTK part-of-speech tagger, download its resources, and install svgling
# (the original note groups these under chunking, parsing and visualization)
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
!pip install svgling

# Import spaCy; uncomment the pip line below if it is not installed
#!pip install spaCy
import spacy
# Load the en_core_web_sm pipeline; later cells call it through the name `nlp`
nlp = spacy.load("en_core_web_sm")
# Import displaCy, spaCy's visualizer, used by the rendering cells further down
from spacy import displacy

## Load TSV into DataFrame



In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package, i.e. a Colab runtime)
# NOTE(review): the visible cells load data through files.upload(), not from the mounted drive —
# confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Select the TSV file to upload from a local folder
from google.colab import files

# `uploaded` is indexed by file name in the next cell to retrieve the file's bytes
uploaded = files.upload()


In [None]:
# Parse the uploaded file into a DataFrame, using its first column as the index
import pandas as pd
import io

# NOTE(review): the file is named .tsv but is parsed as comma-separated (sep=',') —
# confirm the delimiter against the actual file.
submission_bytes = uploaded['test_submissions.tsv']
df = pd.read_csv(io.BytesIO(submission_bytes), sep=',', index_col=0)
df.head()

## Preprocessing

In [None]:
# Build the cleaned 'Text' column from 'Text_NoHeaders' in three steps:
#   1. lowercase every word;
#   2. delete punctuation outright, keeping only hyphens, periods and apostrophes;
#   3. turn the remaining periods into spaces so that words on either side of a
#      sentence boundary are not fused into incorrect compounds.
df['Text'] = (
    df['Text_NoHeaders']
    .str.lower()
    .str.replace(r'[^\w\-\.\'\s]+', '', regex=True)
    .str.replace(r'[^\w\-\'\s]+', ' ', regex=True)
)

df.head()

In [None]:
# Remove stopwords
stop_words = set(stopwords.words("english"))


def _without_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` dropped."""
    kept = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept)


# Store the filtered text next to the cleaned text rather than overwriting it
df['no_stops'] = df['Text'].apply(_without_stopwords)

df.head()

# Text Analysis and Enrichment

In [None]:
# Get the word count of each text.
# len() on a string counts characters, not words, so split each text on
# whitespace and count the resulting pieces instead. The .str accessor also
# propagates missing values as NaN rather than raising a TypeError.
df['Length'] = df['Text_NoHeaders'].str.split().str.len()
df.head()

In [None]:
# Bar chart of text length against portfolio score
import matplotlib.pyplot as plt

# Order the rows from lowest to highest score (ascending is the default) so the
# bars appear in score order; note that this reassigns `df` for later cells too.
df = df.sort_values('Portfolio Score')

df.plot.bar(x='Portfolio Score', y='Length')

In [None]:
# Get lemmas
# The dependency parser and NER are switched off for this pass because only the
# lemma of each token is read from the processed documents.
with nlp.disable_pipes('parser', 'ner'):
    # One inner list of lemmas per document, in the same order as the rows of df
    lemma_list = [
        [token.lemma_ for token in doc]
        for doc in nlp.pipe(df.Text.astype('unicode').values, batch_size=100)
    ]

# Make the lemma lists a new column in the dataframe
df['lemma_list'] = lemma_list
df.head()

In [None]:
# Get part of speech tags
# The dependency parser and NER are switched off for this pass because only the
# POS tag of each token is read from the processed documents.
with nlp.disable_pipes('parser', 'ner'):
    cleaned_texts = df.Text.astype('unicode').values
    # One inner list of POS tags per document, in the same order as the rows of df
    pos_list = [
        [token.pos_ for token in doc]
        for doc in nlp.pipe(cleaned_texts, batch_size=100)
    ]

# Make the POS lists a new column in the dataframe
df['pos_list'] = pos_list

# Check pos tags
df.head()

In [None]:
# Run the full pipeline on a single text (the entry that [0] selects from
# 'Text_NoHeaders'), echo it, and draw its dependency parse
sample_text = df['Text_NoHeaders'][0]
doc = nlp(sample_text)
print(doc)

displacy.render(doc, jupyter=True, style="dep")

In [None]:
# Get named entities
# NOTE(review): this runs on df.Text, which the preprocessing cell lowercased and
# stripped of most punctuation; entity recognition may behave differently on
# 'Text_NoHeaders' — confirm which input is intended.
with nlp.disable_pipes('tagger', 'parser'):
    # Each entry is the `ents` value of one processed document, in row order
    ent_list = [
        doc.ents
        for doc in nlp.pipe(df.Text.astype('unicode').values, batch_size=100)
    ]

df['ent_list'] = ent_list

# Check named entities
df.head()

In [None]:
# Run the full pipeline on a single cleaned text (the entry that [0] selects
# from 'Text') and highlight the named entities found in it
sample_clean_text = df['Text'][0]
doc = nlp(sample_clean_text)

displacy.render(doc, jupyter=True, style="ent")