# Paper Grading Assistant

## Data Wrangling and Pre-processing

Data comes from these links:
- https://components.one/datasets/all-the-news-2-news-articles-dataset/
- https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
- https://www.kaggle.com/krsoninikhil/pual-graham-essays
- https://www.kaggle.com/c/asap-sas/data
- https://www.kaggle.com/c/asap-aes/data
- https://www.kaggle.com/thevirusx3/automated-essay-scoring-dataset

In [7]:
# !pip install gensim
import os, sys
from gensim import corpora, models
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxw2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maxw2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
train_docs = {
    'doc1' : "D:\\Kaggle\\asap-sas\\train.tsv",
    'doc2' : "D:\\Kaggle\\asap-aes\\training_set_rel3.tsv",
    'doc3' : "D:\\Kaggle\\paul-graham-essays\\paul_graham_essay.txt",
    'doc4' : "D:\\Kaggle\\all-the-news-2-1\\all-the-news-2-1.csv", # large file
    'doc5' : "gibberish" # bad input
} 

In [9]:
# Cleaning the text

def get_data(path):
    dataset = []
    
    if os.path.getsize(path) > 1 * 10^9:
        dataset = clean_big_file(path)
        return dataset
    if path.endswith('.tsv'):
        dataset = pd.read_table(path)
    elif path.endswith('.csv'):
        dataset = pd.read_csv(path)
    elif path.endswith('.txt'):
        with open(path) as file:
            for line in file:
                dataset.append(line.rstrip())
        file.close()
    else:
        dataset = ''
    return (dataset)

def strip_html(raw_html):
    clean_re = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    text = re.sub(clean_re, '', raw_html)
    return text

def lem_stem_text(text):
    # remove handles and urls specifically
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    # remove anything not a letter
    text = re.sub('[^a-zA-Z]', ' ', text)

    text = text.lower()
    text = text.split()
    
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    
    text1 = [ps.stem(word) for word in text if not word in set(all_stopwords)]
    text2 = [wnl.lemmatize(word) for word in text if not word in set(all_stopwords)]
    text1 = ' '.join(text1)
    text2 = ' '.join(text2)
    return text1, text2
    
def clean_big_file(file):
    data = []
    count = 0
    
    import csv
 
    with open(file, newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        
#         for line in csv_reader:
#             cols.append(line)
#             break
 
#         print(cols[0])
    
#         target = input("Which column has the text? Copy and paste here: ")
#         print(target)
        
#         idx = cols[0].index(target)
        
        try:
            for line in csv_file:
                if count < 500:
                    data.append(line)
                    count += 1
                else:
                    break
        except:
            print("Unexpected error:", sys.exc_info())
            count += 1
            pass
    csv_file.close()
    header = data.pop(0)
    return data
    
def clean_df(df):
    cols = df.columns
    print(cols)
    target = input("Which column has the text? Copy and paste here: ")
    print(target)
    clean_text = []
    for i in range(len(df)):
        try:
            text = strip_html(df[target][i])
            text1, text2 = lem_stem_text(text)
            clean_text.append([text1, text2])
        except:
            pass
#     print(clean_text)
    return clean_text

def clean_list(lst):
    print('processing text data...')
    clean_text = []
    for i in range(len(lst)):
        try:
            lst[i] = lst[i].strip()
            if len(lst[i]) < 2: # removes random empty lines
                lst.pop(i)
                continue
            text = strip_html(lst[i])
            text1, text2 = lem_stem_text(text)
            clean_text.append([text1, text2])
        except IndexError:
            break
        except:
            print("Unexpected error:", sys.exc_info())
            pass
#     print(clean_text)
    return clean_text

def process_data(data):
    if isinstance(data, pd.DataFrame):
        return clean_df(data)
    elif isinstance(data, list):
        return clean_list(data)
    else:
        print('data type not recognized')
        return ''
    

In [10]:
data = clean_big_file("D:\\Kaggle\\all-the-news-2-1\\all-the-news-2-1.csv")
data = clean_list(data)
data

processing text data...


[['lee drutmanw take concern health liber democraci seriouslythi post part polyarchi independ blog produc polit reform program new america washington think tank devot develop new idea new voic imagin otherwis healthi someth start feel weird sometim short breath get migrain feet start swell littl otherwis everyth seem fine go doctor doctor run test tell youit probabl noth could sign come heart attack push certainti doctor tell she not sure human bodi complex system your young otherwis pretti healthi could plenti explan your feel littl worri safe side mayb reduc stress life eat healthier diet would your sensibl person youd probabl err side precaut sure might noth worri likelihood heart attack might low even low chanc low chanc someth possibl fatal take chanc especi recommend less stress healthier diet good either way offer parabl way think debat that emerg past two week respons amanda taub new york time articl profil new find roberto stefan foa yascha mounk find rais alarm fact younger p

In [11]:
all_data = []
for key in train_docs.keys():
    try:
        data = get_data(train_docs[key])
        big_data = process_data(data)
        all_data.append(big_data)
    except:
        print("Unexpected error: ", sys.exc_info())

processing text data...
Unexpected error: (<class 'UnicodeDecodeError'>, UnicodeDecodeError('utf-8', b's top doctors that too much time spent on the computer can cause damage to your health. Exercise is highly stressed by doctors around the globe and the only exercise a computer addict will be doing is typing up a storm. They can submerge themselves into obesity. This often causes depression as it begins to sink in. Addition itself is a serious health issue. I wouldn\'t be surprised if they add a computer wing to rehab centers sometime in the near future. These problems are slowly spreading and we must come together to abolish them! Family time is an treasured value, to the @CAPS2 people, but lately, computers are ruining this tradition. With so @CAPS1 people absorbed with online drama, friends, and even video games, family interaction is declining. People slowly remove themselves from reality and enter an alternate world of cyberspace. They @MONTH1 only emerge from their rooms to use 

In [12]:
all_data[0]

[['addit inform would need replic experi much vinegar place ident contain tool use measur mass four differ sampl much distil water use rins four sampl take vinegar',
  'additional information would need replicate experiment much vinegar placed identical container tool use measure mass four different sample much distilled water use rinse four sample taking vinegar'],
 ['read expir realiz addit inform need replic expirei one amant vinegar pour contain two label contain start yar expir three write conclus make sure yar result accur',
  'reading expirement realized additional information need replicate expireiment one amant vinegar poured container two label container start yar expirement three write conclusion make sure yar result accurate'],
 ['need trial control set exact amount vinegar pour cupbeak could also take check mass everi min hour',
  'need trial control set exact amount vinegar pour cupbeaker could also take check mass every min hour'],
 ['student list rock better rock wors p

In [13]:
all_data[1]

[['dear local newspap think effect comput peopl great learn skillsaffect give us time chat friendsnew peopl help us learn globeastronomi keep us trobl thing dont think would feel teenag alway phone friend ever time chat friend buis partner thing well there new way chat comput plenti site internet organ organ cap facebook myspac ect think set meet boss comput teenag fun phone not rush get caus want use learn countrysst outsid well computerinternet new way learn go time might think child spend lot time comput ask question economi sea floor spread even date youll surpris much hesh know believ not comput much interest class day read book child home comput local librari better friend fresh perpressur someth know isnt right might not know child cap forbidd hospit bed drivebi rather child comput learn chat play game safe sound home commun place hope reach point understand agre comput great effect child give us time chat friendsnew peopl help us learn globe believ not keep us trobl thank liste

In [14]:
all_data[2]

[['fma exampl gener surpris hard', 'fma example general surprising hard'],
 ['combin achiev territori tend pick',
  'combination achieve territory tends picked'],
 ['clean precis insight valuabl', 'clean precisely insight valuable'],
 ['either surpris without gener eg', 'either surprising without general eg'],
 ['gossip gener without surpris eg', 'gossip general without surprising eg'],
 ['platitud', 'platitude'],
 ['insight get small addit whichev', 'insight get small addition whichever'],
 ['qualiti miss common case small', 'quality missing common case small'],
 ['addit gener piec gossip that', 'addition generality piece gossip thats'],
 ['gossip teach someth interest', 'gossip teach something interesting'],
 ['world anoth less common approach focu',
  'world another le common approach focus'],
 ['gener idea see find someth new', 'general idea see find something new'],
 ['say start gener', 'say start general'],
 ['need small delta novelti produc use',
  'need small delta novelty prod

In [15]:
all_data[3]

[['lee drutmanw take concern health liber democraci seriouslythi post part polyarchi independ blog produc polit reform program new america washington think tank devot develop new idea new voic imagin otherwis healthi someth start feel weird sometim short breath get migrain feet start swell littl otherwis everyth seem fine go doctor doctor run test tell youit probabl noth could sign come heart attack push certainti doctor tell she not sure human bodi complex system your young otherwis pretti healthi could plenti explan your feel littl worri safe side mayb reduc stress life eat healthier diet would your sensibl person youd probabl err side precaut sure might noth worri likelihood heart attack might low even low chanc low chanc someth possibl fatal take chanc especi recommend less stress healthier diet good either way offer parabl way think debat that emerg past two week respons amanda taub new york time articl profil new find roberto stefan foa yascha mounk find rais alarm fact younger p