# Paper Grading Assistant

## Data Wrangling and Pre-processing

Data comes from these links:
- https://components.one/datasets/all-the-news-2-news-articles-dataset/
- https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
- https://www.kaggle.com/krsoninikhil/pual-graham-essays
- https://www.kaggle.com/c/asap-sas/data
- https://www.kaggle.com/c/asap-aes/data
- https://www.kaggle.com/thevirusx3/automated-essay-scoring-dataset

In [1]:
# !pip install gensim
import os, sys
from gensim import corpora, models
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxw2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maxw2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
train_docs = {
    'doc1' : "D:\\Kaggle\\asap-sas\\train.tsv",
    'doc2' : "D:\\Kaggle\\asap-aes\\training_set_rel3.tsv",
    'doc3' : "D:\\Kaggle\\paul-graham-essays\\paul_graham_essay.txt",
    'doc4' : "D:\\Kaggle\\all-the-news-2-1\\all-the-news-2-1.csv", # large file
    'doc5' : "gibberish" # bad input
} 

In [13]:
# Cleaning the text

def get_data(path):
    dataset = []
    
    if os.path.getsize(path) > 1 * 10^9:
        dataset = clean_big_file(path)
        return dataset
    if path.endswith('.tsv'):
        dataset = pd.read_table(path)
    elif path.endswith('.csv'):
        dataset = pd.read_csv(path)
    elif path.endswith('.txt'):
        with open(path) as file:
            for line in file:
                dataset.append(line.rstrip())
        file.close()
    else:
        dataset = ''
    return (dataset)

def strip_html(raw_html):
    clean_re = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    text = re.sub(clean_re, '', raw_html)
    return text

def lem_stem_text(text):
    # remove handles and urls specifically
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    # remove anything not a letter
    text = re.sub('[^a-zA-Z]', ' ', text)

    text = text.lower()
    text = text.split()
    
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    
    text1 = [ps.stem(word) for word in text if not word in set(all_stopwords)]
    text2 = [wnl.lemmatize(word) for word in text if not word in set(all_stopwords)]
    text1 = ' '.join(text1)
    text2 = ' '.join(text2)
    return text1, text2
    
def clean_big_file(file):
    data = []
    count = 0
    cols = []
    
    import csv
 
    with open(file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter = ',')
        
        for line in csv_reader:
            cols.append(line)
            break
 
        print(cols[0])
    
        target = input("Which column has the text? Copy and paste here: ")
        print(target)
        
        idx = cols[0].index(target)
        
        try:
            for line in csv_file:
                if count < 5000:
                    print(strip_html(line[idx]))
                    print('=====')
                    count += 1
                else:
                    break
        except:
            print("Unexpected error:", sys.exc_info())
            count += 1
            pass
#             data.append(line.rstrip())
    csv_file.close()
            
    return data
    
def clean_df(df):
    cols = df.columns
    print(cols)
    target = input("Which column has the text? Copy and paste here: ")
    print(target)
    clean_text = []
    for i in range(len(df)):
        try:
            text = strip_html(df[target][i])
            text1, text2 = lem_stem_text(text)
            clean_text.append([text1, text2])
        except:
            pass
#     print(clean_text)
    return clean_text

def clean_list(lst):
    print('processing text data...')
    clean_text = []
    for i in range(len(lst)):
        try:
            lst[i] = lst[i].strip()
            if len(lst[i]) < 2: # removes random empty lines
                lst.pop(i)
                continue
            text = strip_html(lst[i])
            text1, text2 = lem_stem_text(text)
            clean_text.append([text1, text2])
        except IndexError:
            break
        except:
            print("Unexpected error:", sys.exc_info())
            pass
#     print(clean_text)
    return clean_text

def process_data(data):
    
     
    if isinstance(data, pd.DataFrame):
        return clean_df(data)
    elif isinstance(data, list):
        return clean_list(data)
    else:
        print('data type not recognized')
        return ''
    

In [14]:
clean_big_file("D:\\Kaggle\\all-the-news-2-1\\all-the-news-2-1.csv")

['', 'Unnamed: 0', 'date', 'year', 'month', 'day', 'author', 'title', 'article', 'url', 'section', 'publication']
Which column has the text? Copy and paste here: article
article
-
=====
Unexpected error: (<class 'UnicodeDecodeError'>, UnicodeDecodeError('charmap', b'y important, as will the use of whatever money they have in free agency. It\'s certainly possible to build a good defense with a high-paid quarterback, but if the Colts felt that paying Luck such a high sum of money would be difficult, perhaps they should have reconsidered what the final numbers. \n\n\n\n\n\n\n\n\n\nThe highest-paid player on all 32 NFL teams",https://www.businessinsider.com/colts-gm-ryan-grigson-andrew-luck-contract-2016-10,,Business Insider\n2,2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President Donald Trump denied a report on Friday that he had ordered Special Counsel Robert Mueller fired last June, calling it \xe2\x80\x9cfake news

[]

In [None]:
all_data = []
for key in train_docs.keys():
    data = get_data(train_docs[key])
    big_data = process_data(data)
    all_data.append(big_data)

Index(['Id', 'EssaySet', 'Score1', 'Score2', 'EssayText'], dtype='object')
Which column has the text? Copy and paste here: EssayText
EssayText
Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6'],
      dtype='object')
Which column has the text? Copy and paste here: essay
essay
processing text data...


In [None]:
all_data[0]

In [None]:
all_data[1]

In [None]:
all_data[2]

In [None]:
all_data[3]