# Dataset data analysis

In [2]:
# --- Imports and one-time setup for the notebook ---------------------------
# General-purpose / numeric / plotting libraries.
import re
import numpy as np
# NOTE(review): `color` is not referenced in the visible cells; this looks like
# an accidental IDE auto-import — confirm and remove if unused.
from clyent import color
from numpy import mean
from numpy import std
import pandas as pandas
import matplotlib.pyplot as plt
import seaborn as seaborn

# scikit-learn: classifiers, text vectorisation, splitting/CV and metrics.
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score,auc,roc_curve
from sklearn.svm import SVC

# NLTK: tokenisation, lemmatisation and the stopwords corpus.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetches the 'stopwords' corpus into the local nltk_data directory
# (reports "already up-to-date" when it is present, as in the output below).
nltk.download('stopwords')
# NOTE(review): duplicate of the `stopwords` import a few lines above.
from nltk.corpus import stopwords

# NOTE(review): `jit` is not used in the visible cells — confirm it is needed.
from numba import jit
import json
import string
# string.punctuation is a single `str` of ASCII punctuation characters,
# not a list, despite the variable name.
punctuations_list = string.punctuation


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/peshmerge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data loading

#### Read both data files.
Note: `wiki-auto-part-2-data.json` has no sentence alignments, while `wiki-auto-part-1-data.json` has both `sentence_alignment` and `paragraph_alignment`.

In [68]:
# Column names of the part-1 file (it also carries paragraph-level alignments).
# NOTE(review): not referenced again in this cell — confirm it is used later.
dataset_cols = [
    'simple',
    'normal',
    'sentence_alignment',
    'paragraph_alignment',
]

# Load part 1 of the dataset; orient='index' treats each top-level JSON key
# as a row label and the nested object's keys as columns.
df_wiki_part_1 = pandas.read_json('wiki-auto-part-1-data.json', orient='index', encoding='utf8')

# Preview the first ten rows.
df_wiki_part_1.head(10)

Unnamed: 0,simple,normal,sentence_alignment,paragraph_alignment
0,"{'id': 702227, 'title': 'Lata Mondal', 'url': ...","{'id': 41918715, 'title': 'Lata Mondal', 'url'...","[[simple-702227-0-0, normal-41918715-0-0], [si...","[[simple-702227-0, normal-41918715-0]]"
1,"{'id': 697879, 'title': 'Tuulikki Ukkola', 'ur...","{'id': 60918445, 'title': 'Tuulikki Ukkola', '...","[[simple-697879-0-0, normal-60918445-0-0], [si...","[[simple-697879-0, normal-60918445-0], [simple..."
2,"{'id': 120452, 'title': 'Kot Sarang', 'url': '...","{'id': 14570332, 'title': 'Kot Sarang', 'url':...","[[simple-120452-0-0, normal-14570332-0-0]]","[[simple-120452-0, normal-14570332-0]]"
3,"{'id': 276895, 'title': 'Elisabeth Charlotte o...","{'id': 946890, 'title': 'Elizabeth Charlotte, ...","[[simple-276895-0-0, normal-946890-0-0], [simp...","[[simple-276895-0, normal-946890-0], [simple-2..."
4,"{'id': 697649, 'title': 'Rory Stewart', 'url':...","{'id': 2656347, 'title': 'Rory Stewart', 'url'...","[[simple-697649-0-0, normal-2656347-0-0], [sim...","[[simple-697649-0, normal-2656347-0], [simple-..."
5,"{'id': 413275, 'title': 'Not Without My Daught...","{'id': 1808380, 'title': 'Not Without My Daugh...","[[simple-413275-0-0, normal-1808380-0-0], [sim...","[[simple-413275-0, normal-1808380-0]]"
6,"{'id': 256064, 'title': 'NBC Symphony Orchestr...","{'id': 150698, 'title': 'NBC Symphony Orchestr...","[[simple-256064-0-0, normal-150698-0-0], [simp...","[[simple-256064-0, normal-150698-0]]"
7,"{'id': 516259, 'title': 'Priyanka Chopra', 'ur...","{'id': 838950, 'title': 'Priyanka Chopra', 'ur...","[[simple-516259-0-0, normal-838950-0-0], [simp...","[[simple-516259-0, normal-838950-0], [simple-5..."
8,"{'id': 103132, 'title': 'Michigan Internationa...","{'id': 966394, 'title': 'Michigan Internationa...","[[simple-103132-0-0, normal-966394-0-0], [simp...","[[simple-103132-0, normal-966394-0], [simple-1..."
9,"{'id': 425394, 'title': 'Loneliness', 'url': '...","{'id': 1570429, 'title': 'Loneliness', 'url': ...","[[simple-425394-0-0, normal-1570429-0-0], [sim...","[[simple-425394-0, normal-1570429-0], [simple-..."


In [69]:
# Column names of the part-2 file (no paragraph-level alignment column).
# NOTE(review): this rebinds `dataset_cols` from the previous cell — confirm
# which value later cells expect.
dataset_cols = [
    'simple',
    'normal',
    'sentence_alignment',
]

# Load part 2 of the dataset; orient='index' treats each top-level JSON key
# as a row label and the nested object's keys as columns.
df_wiki_part_2 = pandas.read_json('wiki-auto-part-2-data.json', orient='index', encoding='utf8')

# Preview the first ten rows.
df_wiki_part_2.head(10)

Unnamed: 0,simple,normal,sentence_alignment
0,"{'id': 78856, 'title': 'Vex, Switzerland', 'ur...","{'id': 7023314, 'title': 'Vex, Switzerland', '...",[]
1,"{'id': 193292, 'title': 'South Portland, Maine...","{'id': 115950, 'title': 'South Portland, Maine...",[]
2,"{'id': 195874, 'title': 'Kosuke Suda', 'url': ...","{'id': 18746548, 'title': 'Kosuke Suda', 'url'...",[]
3,"{'id': 608316, 'title': 'Cheng Lim LRT Station...","{'id': 3490013, 'title': 'Cheng Lim LRT statio...",[]
4,"{'id': 684573, 'title': 'Swifton, Arkansas', '...","{'id': 107051, 'title': 'Swifton, Arkansas', '...",[]
5,"{'id': 152877, 'title': 'Burley, Idaho', 'url'...","{'id': 110663, 'title': 'Burley, Idaho', 'url'...",[]
6,"{'id': 200958, 'title': 'S.S.D. Licata 1931', ...","{'id': 6968112, 'title': 'A.S.D. Licata 1931',...",[]
7,"{'id': 519888, 'title': 'Vivian, Louisiana', '...","{'id': 115518, 'title': 'Vivian, Louisiana', '...",[]
8,"{'id': 342309, 'title': '201 BC', 'url': 'http...","{'id': 60498, 'title': '201 BC', 'url': 'https...",[]
9,"{'id': 164901, 'title': 'Saint-Lys', 'url': 'h...","{'id': 8104124, 'title': 'Saint-Lys', 'url': '...",[]


In [None]:

def reformat_json_file(input_file, outpu_file):
    """Rewrite a keyed JSON object as a JSON array of its values.

    Leave it here for now!
    This function is used to convert the structure of the json files to group
    the data by [simple, normal, sentence_alignment].

    The top-level object in ``input_file`` is loaded, its keys are dropped, and
    its values (in the order they appear in the file) are written to
    ``outpu_file`` as a single JSON array. ``"finished"`` is printed when the
    output has been written.

    Args:
        input_file: Path of the JSON file to read; its top level must be a
            JSON object.
        outpu_file: Path of the JSON file to write; created or overwritten.
            (The misspelled name is kept so keyword callers keep working.)

    Returns:
        None.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``input_file`` is not valid JSON.
        AttributeError: If the top-level JSON value is not an object.
    """
    # Context managers guarantee the handles are closed even when parsing or
    # writing fails; JSON is read as UTF-8 regardless of the platform locale.
    with open(input_file, 'r', encoding='utf-8') as input_json_file:
        input_data = json.load(input_json_file)

    data_list = list(input_data.values())

    # Serialise before opening the destination so a serialisation failure
    # cannot leave behind a truncated output file.
    output_data = json.dumps(data_list)
    with open(outpu_file, 'w', encoding='utf-8') as output_json_file:
        output_json_file.write(output_data)
    print("finished")