# Implementation

## Libraries and Settings

In [1]:
import pandas as pd

# Project-local helpers: the ten text-quality metrics, the dimension
# aggregator (column_average) and the text cleaner (processed_text)
from utils import (
    abbreviation_metric,
    spelling_mistake_metric,
    unknown_word_metric,
    grammatical_sentence_metric,
    lexical_density_metric,
    lexical_diversity_metric,
    stop_word_metric,
    average_sentence_length_metric,
    uppercased_word_metric,
    symbol_punctuation_metric,
    column_average,
    processed_text,
)

# Reading and writing the pickled datasets
import pickle
from pathlib import Path

# English sentiment analysis; the VADER lexicon is fetched up front
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maryl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\maryl\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
  "class": algorithms.Blowfish,
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\maryl\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Load Dataset

In [2]:
# Load the raw dataset from its pickle file (path is relative to the
# current working directory).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files from a trusted source.
# NOTE(review): the folder is spelled 'Processed data' here but
# 'Processed Data' in the later save/reload cells. That is the same folder
# on Windows, but two different folders on a case-sensitive filesystem.
with open(Path('Data/Processed data/dataset.pkl'), 'rb') as file:
    df = pickle.load(file)

In [3]:
# Sanity check: size of the loaded dataset as (rows, columns)
df.shape

(5000, 5)

## Text Processing

In [4]:
# Derive the cleaned `processed_text` column from the raw `text` column
# using the helper imported from utils. The returned frame is displayed as
# the cell output; `df` itself carries the new column afterwards, since it
# is pickled in the next cell and shows up in every later table.
processed_text(df,'text')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over
...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...


In [5]:
# Persist the processed dataset so the metric sections can reload it.
# The context manager guarantees the file handle is closed even if
# pickle.dump raises part-way through the write.
with open('Data/Processed Data/processed_dataset.pkl', 'wb') as picklefile:
    pickle.dump(df, picklefile)

## Context-independent Metrics of Text Data Quality

In [6]:
# Reload the processed dataset written by the previous section (path is
# relative to the current working directory).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files from a trusted source.
with open(Path('Data/Processed Data/processed_dataset.pkl'), 'rb') as file:
    df = pickle.load(file)

### Metric 1: Abbreviation metric

In [7]:
# Metric 1: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m1_abbrv_txt` and `m1_abbrv_ptxt`.
# The frame returned by the last call is what the cell displays.
abbreviation_metric(df,'text','m1_abbrv_txt')
abbreviation_metric(df,'processed_text','m1_abbrv_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0
...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0


### Metric 2: Spelling mistake metric

In [8]:
# Metric 2: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m2_sp_mstke_txt` and `m2_sp_mstke_ptxt`.
# The frame returned by the last call is what the cell displays.
spelling_mistake_metric(df,'text','m2_sp_mstke_txt')
spelling_mistake_metric(df,'processed_text','m2_sp_mstke_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0
...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0


### Metric 3: Unknown words metric

In [9]:
# Metric 3: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m3_unk_wds_txt` and `m3_unk_wds_ptxt`.
# The frame returned by the last call is what the cell displays.
unknown_word_metric(df,'text','m3_unk_wds_txt')
unknown_word_metric(df,'processed_text','m3_unk_wds_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,m3_unk_wds_txt,m3_unk_wds_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,0.736842,1.0
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,0.952381,1.0
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,1.000000,1.0
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,1.000000,1.0
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,0.857143,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,0.977778,1.0
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,0.969072,1.0
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,1.000000,1.0
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,0.900000,1.0


### Metric 4: Grammatical sentence metric

In [10]:
# Metric 4: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m4_gram_sent_txt` and
# `m4_gram_sent_ptxt` (both columns appear in the tables of later cells).
grammatical_sentence_metric(df, 'text','m4_gram_sent_txt')
grammatical_sentence_metric(df, 'processed_text','m4_gram_sent_ptxt')

### Metric 5: Lexical density metric

In [11]:
# Metric 5: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m5_lex_den_txt` and `m5_lex_den_ptxt`.
# The frame returned by the last call is what the cell displays.
lexical_density_metric(df, 'text','m5_lex_den_txt')
lexical_density_metric(df, 'processed_text','m5_lex_den_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,m3_unk_wds_txt,m3_unk_wds_ptxt,m4_gram_sent_txt,m4_gram_sent_ptxt,m5_lex_den_txt,m5_lex_den_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,0.736842,1.0,0.000000,0.0,0.454545,0.625000
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,0.952381,1.0,0.000000,0.0,0.363636,0.833333
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,1.000000,1.0,0.333333,1.0,0.500000,0.692308
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,1.000000,1.0,0.000000,0.0,0.500000,0.833333
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,0.857143,1.0,0.000000,0.0,0.500000,0.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,0.977778,1.0,0.500000,1.0,0.568182,0.846154
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,0.969072,1.0,0.000000,0.0,0.489796,0.775862
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,1.000000,1.0,0.333333,1.0,0.447368,0.761905
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,0.900000,1.0,0.333333,1.0,0.333333,0.611111


### Metric 6: Lexical diversity metric

In [12]:
# Metric 6: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m6_lex_div_txt` and `m6_lex_div_ptxt`.
# The frame returned by the last call is what the cell displays.
lexical_diversity_metric(df, 'text','m6_lex_div_txt')
lexical_diversity_metric(df, 'processed_text','m6_lex_div_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,m3_unk_wds_txt,m3_unk_wds_ptxt,m4_gram_sent_txt,m4_gram_sent_ptxt,m5_lex_den_txt,m5_lex_den_ptxt,m6_lex_div_txt,m6_lex_div_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,0.736842,1.0,0.000000,0.0,0.454545,0.625000,1.000000,1.000000
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,0.952381,1.0,0.000000,0.0,0.363636,0.833333,1.000000,1.000000
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,1.000000,1.0,0.333333,1.0,0.500000,0.692308,1.000000,0.923077
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,1.000000,1.0,0.000000,0.0,0.500000,0.833333,1.000000,1.000000
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,0.857143,1.0,0.000000,0.0,0.500000,0.428571,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,0.977778,1.0,0.500000,1.0,0.568182,0.846154,0.866667,0.923077
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,0.969072,1.0,0.000000,0.0,0.489796,0.775862,0.762887,0.844828
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,1.000000,1.0,0.333333,1.0,0.447368,0.761905,0.868421,0.904762
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,0.900000,1.0,0.333333,1.0,0.333333,0.611111,0.933333,0.944444


### Metric 7: Stop word metric

In [13]:
# Metric 7: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m7_stp_wd_txt` and `m7_stp_wd_ptxt`.
# The frame returned by the last call is what the cell displays.
stop_word_metric(df,'text','m7_stp_wd_txt')
stop_word_metric(df,'processed_text','m7_stp_wd_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,m3_unk_wds_txt,m3_unk_wds_ptxt,m4_gram_sent_txt,m4_gram_sent_ptxt,m5_lex_den_txt,m5_lex_den_ptxt,m6_lex_div_txt,m6_lex_div_ptxt,m7_stp_wd_txt,m7_stp_wd_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,0.736842,1.0,0.000000,0.0,0.454545,0.625000,1.000000,1.000000,0.789474,1.000000
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,0.952381,1.0,0.000000,0.0,0.363636,0.833333,1.000000,1.000000,0.619048,1.000000
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,1.000000,1.0,0.333333,1.0,0.500000,0.692308,1.000000,0.923077,0.722222,0.692308
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,1.000000,1.0,0.000000,0.0,0.500000,0.833333,1.000000,1.000000,0.600000,1.000000
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,0.857143,1.0,0.000000,0.0,0.500000,0.428571,1.000000,1.000000,0.523810,0.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,0.977778,1.0,0.500000,1.0,0.568182,0.846154,0.866667,0.923077,0.600000,0.923077
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,0.969072,1.0,0.000000,0.0,0.489796,0.775862,0.762887,0.844828,0.628866,0.844828
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,1.000000,1.0,0.333333,1.0,0.447368,0.761905,0.868421,0.904762,0.631579,0.857143
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,0.900000,1.0,0.333333,1.0,0.333333,0.611111,0.933333,0.944444,0.633333,0.722222


### Metric 8: Average sentence length metric

In [14]:
# Metric 8: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m8_sent_lgth_txt` and
# `m8_sent_lgth_ptxt`.
# The frame returned by the last call is what the cell displays.
average_sentence_length_metric(df, 'text','m8_sent_lgth_txt')
average_sentence_length_metric(df, 'processed_text','m8_sent_lgth_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m4_gram_sent_txt,m4_gram_sent_ptxt,m5_lex_den_txt,m5_lex_den_ptxt,m6_lex_div_txt,m6_lex_div_ptxt,m7_stp_wd_txt,m7_stp_wd_ptxt,m8_sent_lgth_txt,m8_sent_lgth_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,0.000000,0.0,0.454545,0.625000,1.000000,1.000000,0.789474,1.000000,0.744444,1.0
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,0.000000,0.0,0.363636,0.833333,1.000000,1.000000,0.619048,1.000000,0.722500,1.0
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,0.333333,1.0,0.500000,0.692308,1.000000,0.923077,0.722222,0.692308,0.802222,1.0
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,0.000000,0.0,0.500000,0.833333,1.000000,1.000000,0.600000,1.000000,1.000000,1.0
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,0.000000,0.0,0.500000,0.428571,1.000000,1.000000,0.523810,0.571429,0.791875,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,0.500000,1.0,0.568182,0.846154,0.866667,0.923077,0.600000,0.923077,0.405000,1.0
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.000000,0.0,0.489796,0.775862,0.762887,0.844828,0.628866,0.844828,0.290278,1.0
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.333333,1.0,0.447368,0.761905,0.868421,0.904762,0.631579,0.857143,0.595556,1.0
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.333333,1.0,0.333333,0.611111,0.933333,0.944444,0.633333,0.722222,0.664444,1.0


### Metric 9: Uppercased word metric

In [15]:
# Metric 9: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m9_upp_wd_txt` and `m9_upp_wd_ptxt`.
# The frame returned by the last call is what the cell displays.
uppercased_word_metric(df, 'text','m9_upp_wd_txt')
uppercased_word_metric(df, 'processed_text','m9_upp_wd_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m5_lex_den_txt,m5_lex_den_ptxt,m6_lex_div_txt,m6_lex_div_ptxt,m7_stp_wd_txt,m7_stp_wd_ptxt,m8_sent_lgth_txt,m8_sent_lgth_ptxt,m9_upp_wd_txt,m9_upp_wd_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,0.454545,0.625000,1.000000,1.000000,0.789474,1.000000,0.744444,1.0,0.947368,1.0
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,0.363636,0.833333,1.000000,1.000000,0.619048,1.000000,0.722500,1.0,1.000000,1.0
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,0.500000,0.692308,1.000000,0.923077,0.722222,0.692308,0.802222,1.0,0.944444,1.0
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,0.500000,0.833333,1.000000,1.000000,0.600000,1.000000,1.000000,1.0,1.000000,1.0
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,0.500000,0.428571,1.000000,1.000000,0.523810,0.571429,0.791875,1.0,0.952381,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,0.568182,0.846154,0.866667,0.923077,0.600000,0.923077,0.405000,1.0,1.000000,1.0
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.489796,0.775862,0.762887,0.844828,0.628866,0.844828,0.290278,1.0,0.948454,1.0
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.447368,0.761905,0.868421,0.904762,0.631579,0.857143,0.595556,1.0,0.947368,1.0
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.333333,0.611111,0.933333,0.944444,0.633333,0.722222,0.664444,1.0,0.933333,1.0


### Metric 10: Punctuation metric

In [16]:
# Metric 10: score the raw `text` column and the cleaned `processed_text`
# column; the scores are stored in `m10_punct_txt` and `m10_punct_ptxt`.
# The frame returned by the last call is what the cell displays.
symbol_punctuation_metric(df,'text','m10_punct_txt')
symbol_punctuation_metric(df,'processed_text','m10_punct_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m6_lex_div_txt,m6_lex_div_ptxt,m7_stp_wd_txt,m7_stp_wd_ptxt,m8_sent_lgth_txt,m8_sent_lgth_ptxt,m9_upp_wd_txt,m9_upp_wd_ptxt,m10_punct_txt,m10_punct_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,1.000000,1.000000,0.789474,1.000000,0.744444,1.0,0.947368,1.0,0.875000,1.0
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,1.000000,1.000000,0.619048,1.000000,0.722500,1.0,1.000000,1.0,0.933333,1.0
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,1.000000,0.923077,0.722222,0.692308,0.802222,1.0,0.944444,1.0,0.957746,1.0
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,1.000000,1.000000,0.600000,1.000000,1.000000,1.0,1.000000,1.0,1.000000,1.0
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,1.000000,1.000000,0.523810,0.571429,0.791875,1.0,0.952381,1.0,0.900000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,0.866667,0.923077,0.600000,0.923077,0.405000,1.0,1.000000,1.0,0.984536,1.0
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.762887,0.844828,0.628866,0.844828,0.290278,1.0,0.948454,1.0,0.975904,1.0
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.868421,0.904762,0.631579,0.857143,0.595556,1.0,0.947368,1.0,0.965517,1.0
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.933333,0.944444,0.633333,0.722222,0.664444,1.0,0.933333,1.0,0.959016,1.0


## Context-independent Dimensions of Text Data Quality

### Accuracy

In [17]:
# Accuracy dimension, original text: per-row mean of metrics 1-4,
# written to `dqd1_accuracy_txt`
accuracy_columns_txt = [
    'm1_abbrv_txt',
    'm2_sp_mstke_txt',
    'm3_unk_wds_txt',
    'm4_gram_sent_txt',
]
column_average(df, accuracy_columns_txt, 'dqd1_accuracy_txt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m6_lex_div_ptxt,m7_stp_wd_txt,m7_stp_wd_ptxt,m8_sent_lgth_txt,m8_sent_lgth_ptxt,m9_upp_wd_txt,m9_upp_wd_ptxt,m10_punct_txt,m10_punct_ptxt,dqd1_accuracy_txt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,1.000000,0.789474,1.000000,0.744444,1.0,0.947368,1.0,0.875000,1.0,0.652961
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,1.000000,0.619048,1.000000,0.722500,1.0,1.000000,1.0,0.933333,1.0,0.738095
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,0.923077,0.722222,0.692308,0.802222,1.0,0.944444,1.0,0.957746,1.0,0.833333
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,1.000000,0.600000,1.000000,1.000000,1.0,1.000000,1.0,1.000000,1.0,0.750000
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,1.000000,0.523810,0.571429,0.791875,1.0,0.952381,1.0,0.900000,1.0,0.714286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,0.923077,0.600000,0.923077,0.405000,1.0,1.000000,1.0,0.984536,1.0,0.863763
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.844828,0.628866,0.844828,0.290278,1.0,0.948454,1.0,0.975904,1.0,0.737113
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.904762,0.631579,0.857143,0.595556,1.0,0.947368,1.0,0.965517,1.0,0.833333
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.944444,0.633333,0.722222,0.664444,1.0,0.933333,1.0,0.959016,1.0,0.791667


In [18]:
# Accuracy dimension, preprocessed text: per-row mean of metrics 1-4,
# written to `dqd1_accuracy_ptxt`
accuracy_columns_ptxt = [
    'm1_abbrv_ptxt',
    'm2_sp_mstke_ptxt',
    'm3_unk_wds_ptxt',
    'm4_gram_sent_ptxt',
]
column_average(df, accuracy_columns_ptxt, 'dqd1_accuracy_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m7_stp_wd_txt,m7_stp_wd_ptxt,m8_sent_lgth_txt,m8_sent_lgth_ptxt,m9_upp_wd_txt,m9_upp_wd_ptxt,m10_punct_txt,m10_punct_ptxt,dqd1_accuracy_txt,dqd1_accuracy_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,0.789474,1.000000,0.744444,1.0,0.947368,1.0,0.875000,1.0,0.652961,0.75
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,0.619048,1.000000,0.722500,1.0,1.000000,1.0,0.933333,1.0,0.738095,0.75
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,0.722222,0.692308,0.802222,1.0,0.944444,1.0,0.957746,1.0,0.833333,1.00
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,0.600000,1.000000,1.000000,1.0,1.000000,1.0,1.000000,1.0,0.750000,0.75
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,0.523810,0.571429,0.791875,1.0,0.952381,1.0,0.900000,1.0,0.714286,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,0.600000,0.923077,0.405000,1.0,1.000000,1.0,0.984536,1.0,0.863763,1.00
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.628866,0.844828,0.290278,1.0,0.948454,1.0,0.975904,1.0,0.737113,0.75
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.631579,0.857143,0.595556,1.0,0.947368,1.0,0.965517,1.0,0.833333,1.00
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.633333,0.722222,0.664444,1.0,0.933333,1.0,0.959016,1.0,0.791667,1.00


### Completeness

In [19]:
# Completeness dimension, original text: per-row mean of metrics 5-7,
# written to `dqd2_completeness_txt`
completeness_columns_txt = [
    'm5_lex_den_txt',
    'm6_lex_div_txt',
    'm7_stp_wd_txt',
]
column_average(df, completeness_columns_txt, 'dqd2_completeness_txt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m7_stp_wd_ptxt,m8_sent_lgth_txt,m8_sent_lgth_ptxt,m9_upp_wd_txt,m9_upp_wd_ptxt,m10_punct_txt,m10_punct_ptxt,dqd1_accuracy_txt,dqd1_accuracy_ptxt,dqd2_completeness_txt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,1.000000,0.744444,1.0,0.947368,1.0,0.875000,1.0,0.652961,0.75,0.748006
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,1.000000,0.722500,1.0,1.000000,1.0,0.933333,1.0,0.738095,0.75,0.660895
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,0.692308,0.802222,1.0,0.944444,1.0,0.957746,1.0,0.833333,1.00,0.740741
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,1.000000,1.000000,1.0,1.000000,1.0,1.000000,1.0,0.750000,0.75,0.700000
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,0.571429,0.791875,1.0,0.952381,1.0,0.900000,1.0,0.714286,0.75,0.674603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,0.923077,0.405000,1.0,1.000000,1.0,0.984536,1.0,0.863763,1.00,0.678283
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.844828,0.290278,1.0,0.948454,1.0,0.975904,1.0,0.737113,0.75,0.627183
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.857143,0.595556,1.0,0.947368,1.0,0.965517,1.0,0.833333,1.00,0.649123
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.722222,0.664444,1.0,0.933333,1.0,0.959016,1.0,0.791667,1.00,0.633333


In [20]:
# Completeness dimension, preprocessed text: per-row mean of metrics 5-7,
# written to `dqd2_completeness_ptxt`
completeness_columns_ptxt = [
    'm5_lex_den_ptxt',
    'm6_lex_div_ptxt',
    'm7_stp_wd_ptxt',
]
column_average(df, completeness_columns_ptxt, 'dqd2_completeness_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m8_sent_lgth_txt,m8_sent_lgth_ptxt,m9_upp_wd_txt,m9_upp_wd_ptxt,m10_punct_txt,m10_punct_ptxt,dqd1_accuracy_txt,dqd1_accuracy_ptxt,dqd2_completeness_txt,dqd2_completeness_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,0.744444,1.0,0.947368,1.0,0.875000,1.0,0.652961,0.75,0.748006,0.875000
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,0.722500,1.0,1.000000,1.0,0.933333,1.0,0.738095,0.75,0.660895,0.944444
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,0.802222,1.0,0.944444,1.0,0.957746,1.0,0.833333,1.00,0.740741,0.769231
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,1.000000,1.0,1.000000,1.0,1.000000,1.0,0.750000,0.75,0.700000,0.944444
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,0.791875,1.0,0.952381,1.0,0.900000,1.0,0.714286,0.75,0.674603,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,0.405000,1.0,1.000000,1.0,0.984536,1.0,0.863763,1.00,0.678283,0.897436
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.290278,1.0,0.948454,1.0,0.975904,1.0,0.737113,0.75,0.627183,0.821839
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.595556,1.0,0.947368,1.0,0.965517,1.0,0.833333,1.00,0.649123,0.841270
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.664444,1.0,0.933333,1.0,0.959016,1.0,0.791667,1.00,0.633333,0.759259


### Consistency

In [21]:
# Consistency dimension (original text):
# average the sentence-length, uppercased-word and punctuation metric columns
# of the original text into the new column 'dqd3_consistency_txt'.
consistency_columns_txt = [
    'm8_sent_lgth_txt',
    'm9_upp_wd_txt',
    'm10_punct_txt',
]
column_average(df, consistency_columns_txt, 'dqd3_consistency_txt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m8_sent_lgth_ptxt,m9_upp_wd_txt,m9_upp_wd_ptxt,m10_punct_txt,m10_punct_ptxt,dqd1_accuracy_txt,dqd1_accuracy_ptxt,dqd2_completeness_txt,dqd2_completeness_ptxt,dqd3_consistency_txt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,1.0,0.947368,1.0,0.875000,1.0,0.652961,0.75,0.748006,0.875000,0.855604
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,1.0,1.000000,1.0,0.933333,1.0,0.738095,0.75,0.660895,0.944444,0.885278
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,1.0,0.944444,1.0,0.957746,1.0,0.833333,1.00,0.740741,0.769231,0.901471
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,1.0,1.000000,1.0,1.000000,1.0,0.750000,0.75,0.700000,0.944444,1.000000
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,1.0,0.952381,1.0,0.900000,1.0,0.714286,0.75,0.674603,0.666667,0.881419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,1.0,1.000000,1.0,0.984536,1.0,0.863763,1.00,0.678283,0.897436,0.796512
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,1.0,0.948454,1.0,0.975904,1.0,0.737113,0.75,0.627183,0.821839,0.738212
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,1.0,0.947368,1.0,0.965517,1.0,0.833333,1.00,0.649123,0.841270,0.836147
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,1.0,0.933333,1.0,0.959016,1.0,0.791667,1.00,0.633333,0.759259,0.852265


In [22]:
# Consistency dimension (preprocessed text):
# average the sentence-length, uppercased-word and punctuation metric columns
# of the preprocessed text into the new column 'dqd3_consistency_ptxt'.
consistency_columns_ptxt = [
    'm8_sent_lgth_ptxt',
    'm9_upp_wd_ptxt',
    'm10_punct_ptxt',
]
column_average(df, consistency_columns_ptxt, 'dqd3_consistency_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m9_upp_wd_txt,m9_upp_wd_ptxt,m10_punct_txt,m10_punct_ptxt,dqd1_accuracy_txt,dqd1_accuracy_ptxt,dqd2_completeness_txt,dqd2_completeness_ptxt,dqd3_consistency_txt,dqd3_consistency_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,0.947368,1.0,0.875000,1.0,0.652961,0.75,0.748006,0.875000,0.855604,1.0
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,1.000000,1.0,0.933333,1.0,0.738095,0.75,0.660895,0.944444,0.885278,1.0
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,0.944444,1.0,0.957746,1.0,0.833333,1.00,0.740741,0.769231,0.901471,1.0
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,1.000000,1.0,1.000000,1.0,0.750000,0.75,0.700000,0.944444,1.000000,1.0
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,0.952381,1.0,0.900000,1.0,0.714286,0.75,0.674603,0.666667,0.881419,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,1.000000,1.0,0.984536,1.0,0.863763,1.00,0.678283,0.897436,0.796512,1.0
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.948454,1.0,0.975904,1.0,0.737113,0.75,0.627183,0.821839,0.738212,1.0
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.947368,1.0,0.965517,1.0,0.833333,1.00,0.649123,0.841270,0.836147,1.0
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.933333,1.0,0.959016,1.0,0.791667,1.00,0.633333,0.759259,0.852265,1.0


## Context-independent Measurement of Text Data Quality

In [23]:
# Context-independent text data quality (original text):
# the overall score 'TDQ_txt' is the average over all ten metric columns
# (m1..m10) of the original text, not over the three dimension scores.
DQ_columns_txt = [
    'm1_abbrv_txt',
    'm2_sp_mstke_txt',
    'm3_unk_wds_txt',
    'm4_gram_sent_txt',
    'm5_lex_den_txt',
    'm6_lex_div_txt',
    'm7_stp_wd_txt',
    'm8_sent_lgth_txt',
    'm9_upp_wd_txt',
    'm10_punct_txt',
]
column_average(df, DQ_columns_txt, 'TDQ_txt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m9_upp_wd_ptxt,m10_punct_txt,m10_punct_ptxt,dqd1_accuracy_txt,dqd1_accuracy_ptxt,dqd2_completeness_txt,dqd2_completeness_ptxt,dqd3_consistency_txt,dqd3_consistency_ptxt,TDQ_txt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,1.0,0.875000,1.0,0.652961,0.75,0.748006,0.875000,0.855604,1.0,0.742267
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,1.0,0.933333,1.0,0.738095,0.75,0.660895,0.944444,0.885278,1.0,0.759090
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,1.0,0.957746,1.0,0.833333,1.00,0.740741,0.769231,0.901471,1.0,0.825997
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,1.0,1.000000,1.0,0.750000,0.75,0.700000,0.944444,1.000000,1.0,0.810000
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,1.0,0.900000,1.0,0.714286,0.75,0.674603,0.666667,0.881419,1.0,0.752521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,1.0,0.984536,1.0,0.863763,1.00,0.678283,0.897436,0.796512,1.0,0.787944
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,1.0,0.975904,1.0,0.737113,0.75,0.627183,0.821839,0.738212,1.0,0.704464
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,1.0,0.965517,1.0,0.833333,1.00,0.649123,0.841270,0.836147,1.0,0.778914
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,1.0,0.959016,1.0,0.791667,1.00,0.633333,0.759259,0.852265,1.0,0.762346


In [24]:
# Context-independent text data quality (preprocessed text):
# the overall score 'TDQ_ptxt' is the average over all ten metric columns
# (m1..m10) of the preprocessed text, not over the three dimension scores.
DQ_columns_ptxt = [
    'm1_abbrv_ptxt',
    'm2_sp_mstke_ptxt',
    'm3_unk_wds_ptxt',
    'm4_gram_sent_ptxt',
    'm5_lex_den_ptxt',
    'm6_lex_div_ptxt',
    'm7_stp_wd_ptxt',
    'm8_sent_lgth_ptxt',
    'm9_upp_wd_ptxt',
    'm10_punct_ptxt',
]
column_average(df, DQ_columns_ptxt, 'TDQ_ptxt')

Unnamed: 0,text,human_sentiment,text_type,length,length_classification,processed_text,m1_abbrv_txt,m1_abbrv_ptxt,m2_sp_mstke_txt,m2_sp_mstke_ptxt,...,m10_punct_txt,m10_punct_ptxt,dqd1_accuracy_txt,dqd1_accuracy_ptxt,dqd2_completeness_txt,dqd2_completeness_ptxt,dqd3_consistency_txt,dqd3_consistency_ptxt,TDQ_txt,TDQ_ptxt
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,twitter,115,medium,aww bummer shoulda got david carr third day,1.0,1.0,0.875000,1.0,...,0.875000,1.0,0.652961,0.75,0.748006,0.875000,0.855604,1.0,0.742267,0.862500
1,is upset that he can't update his Facebook by ...,negative,twitter,111,medium,upset cannot update facebook texting might cry...,1.0,1.0,1.000000,1.0,...,0.933333,1.0,0.738095,0.75,0.660895,0.944444,0.885278,1.0,0.759090,0.883333
2,@Kenichan I dived many times for the ball. Man...,neutral,twitter,89,medium,dived many times for the ball managed save the...,1.0,1.0,1.000000,1.0,...,0.957746,1.0,0.833333,1.00,0.740741,0.769231,0.901471,1.0,0.825997,0.930769
3,my whole body feels itchy and like its on fire,negative,twitter,47,short,whole body feels itchy like fire,1.0,1.0,1.000000,1.0,...,1.000000,1.0,0.750000,0.75,0.700000,0.944444,1.000000,1.0,0.810000,0.883333
4,"@nationwideclass no, it's not behaving at all....",neutral,twitter,111,medium,no not behaving mad cannot see over,1.0,1.0,1.000000,1.0,...,0.900000,1.0,0.714286,0.75,0.674603,0.666667,0.881419,1.0,0.752521,0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,i love it! the berry scent is faint and delici...,positive,review,238,medium,love the berry scent faint delicious the produ...,1.0,1.0,0.977273,1.0,...,0.984536,1.0,0.863763,1.00,0.678283,0.897436,0.796512,1.0,0.787944,0.969231
996,I use this almost every night and I wake up wi...,neutral,review,511,long,use almost every night wake little left over t...,1.0,1.0,0.979381,1.0,...,0.975904,1.0,0.737113,0.75,0.627183,0.821839,0.738212,1.0,0.704464,0.846552
997,This mask is great! It smells delicious and my...,positive,review,182,medium,mask great smells delicious lips feel plump sm...,1.0,1.0,1.000000,1.0,...,0.965517,1.0,0.833333,1.00,0.649123,0.841270,0.836147,1.0,0.778914,0.952381
998,These is similar to Vaseline with a tint of co...,negative,review,151,medium,similar vaseline tint color but prettier jar o...,1.0,1.0,0.933333,1.0,...,0.959016,1.0,0.791667,1.00,0.633333,0.759259,0.852265,1.0,0.762346,0.927778


In [25]:
# Create pickle file
# Persist the frame with all metric, dimension and TDQ columns.
# The context manager closes the handle even if pickling raises, which the
# previous manual open()/close() pair did not guarantee.
with open('Data/Results/dataset_TDQ.pkl', 'wb') as picklefile:
    pickle.dump(df, picklefile)

## Sentiment Analysis

In [35]:
# Load the pickled dataset that already contains the text-data-quality columns.
# The path is relative to the notebook's working directory; the former
# Path('dataset_TDQ.pkl').parent prefix evaluated to '.' and added nothing.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this project.
with open(Path('Data/Results/dataset_TDQ.pkl'), 'rb') as pickle_in:
    df = pickle.load(pickle_in)

In [36]:
# Load Sentiment Analysis Model
# NLTK's SentimentIntensityAnalyzer (VADER); it needs the 'vader_lexicon'
# resource that the imports cell downloads. The instance is reused for both
# the original and the preprocessed text below.
sentiment = SentimentIntensityAnalyzer()

In [37]:
# Sentiment analysis for the original text.
# Each text is scored with the analyzer and its 'compound' score is bucketed:
#   compound >= 0.05  -> 'positive'
#   compound <= -0.05 -> 'negative'
#   otherwise         -> 'neutral'
sentiment_txt = [
    'positive' if compound >= 0.05 else 'negative' if compound <= -0.05 else 'neutral'
    for compound in (
        sentiment.polarity_scores(text)['compound'] for text in df['text']
    )
]
df['sentiment_txt'] = sentiment_txt

In [38]:
# Sentiment analysis for the preprocessed text.
# Each text is scored with the analyzer and its 'compound' score is bucketed:
#   compound >= 0.05  -> 'positive'
#   compound <= -0.05 -> 'negative'
#   otherwise         -> 'neutral'
sentiment_ptxt = [
    'positive' if compound >= 0.05 else 'negative' if compound <= -0.05 else 'neutral'
    for compound in (
        sentiment.polarity_scores(text)['compound'] for text in df['processed_text']
    )
]
df['sentiment_ptxt'] = sentiment_ptxt

## Sentiment Analysis Performance

In [39]:
# Mark, row by row, whether the predicted label agrees with the human label
# (True/False), separately for the original and the preprocessed text.
human_labels = df['human_sentiment']
df['comparison_txt'] = human_labels.eq(df['sentiment_txt'])
df['comparison_ptxt'] = human_labels.eq(df['sentiment_ptxt'])

In [40]:
# Turn the boolean agreement flags into 1 (match) / 0 (mismatch) integers.
for flag_column in ('comparison_txt', 'comparison_ptxt'):
    df[flag_column] = df[flag_column].astype(int)

## Dataset to analyze

In [41]:
# Export the final frame (metrics, TDQ scores, sentiment labels and agreement
# flags) to an Excel workbook; index=False keeps the row index out of the sheet.
df.to_excel('Data/Results/dataset_to_analyze.xlsx', index=False)

In [42]:
# Create pickle file
# Persist the final frame (same content as the Excel export) as a pickle.
# The context manager closes the handle even if pickling raises, which the
# previous manual open()/close() pair did not guarantee.
with open('Data/Results/dataset_to_analyze.pkl', 'wb') as picklefile:
    pickle.dump(df, picklefile)