In [1]:
# !pip install tensorflow_data_validation[visualization]
# !pip install tensorflow-data-validation
!pip install --upgrade 'tensorflow_data_validation[visualization]'

ERROR: Invalid requirement: ''tensorflow_data_validation[visualization]''



In [2]:
import ast

import pandas as pd
import tensorflow_data_validation as tfdv



## Read the training data

In [3]:
# Load the tab-separated training split and preview its first rows.
df_train = pd.read_csv('data/train.tsv', sep='\t')
df_train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"
2,How to terminate windows phone 8.1 app,['c#']
3,get current time in a specific country via jquery,"['javascript', 'jquery']"
4,Configuring Tomcat to Use SSL,['java']


In [4]:
# Summary of the training columns (count / unique / top / freq).
df_train.describe()

Unnamed: 0,title,tags
count,100000,100000
unique,99984,7066
top,Conversion failed when converting date and/or ...,['java']
freq,3,7026


In [5]:
# (rows, columns) of the training split.
df_train.shape

(100000, 2)

## Read the validation data

In [6]:
# Load the tab-separated validation split and preview its first rows.
df_val = pd.read_csv('data/validation.tsv', sep='\t')
df_val.head()

Unnamed: 0,title,tags
0,Why odbc_exec always fail?,"['php', 'sql']"
1,Access a base classes variable from within a c...,['javascript']
2,"Content-Type ""application/json"" not required i...","['ruby-on-rails', 'ruby']"
3,Sessions in Sinatra: Used to Pass Variable,"['ruby', 'session']"
4,"Getting error - type ""json"" does not exist - i...","['ruby-on-rails', 'ruby', 'json']"


In [7]:
# Summary of the validation columns (count / unique / top / freq).
df_val.describe()

Unnamed: 0,title,tags
count,30000,30000
unique,29996,3482
top,Object reference not set to an instance of an ...,['java']
freq,2,2044


In [8]:
# (rows, columns) of the validation split.
df_val.shape

(30000, 2)

## Read the test data

In [9]:
# Load the tab-separated test split and preview its first rows.
df_test = pd.read_csv('data/test.tsv', sep='\t')
df_test.head()

Unnamed: 0,title
0,Warning: mysql_query() expects parameter 2 to ...
1,get click coordinates from <input type='image'...
2,How to implement cloud storage for media asset...
3,What is catcomplete in jQuery's autocomplete p...
4,Error building Android app with Cordova 3.1 CLI


## Use tfdv to compute statistics for the training set

In [10]:
# Compute TFDV statistics over the training DataFrame.
train_stats = tfdv.generate_statistics_from_dataframe(df_train)

In [11]:
# Render the training statistics computed in the previous cell.
tfdv.visualize_statistics(train_stats)

## Use tfdv to compute statistics for the validation set

In [12]:
# Compute TFDV statistics over the validation DataFrame and render them.
val_stats = tfdv.generate_statistics_from_dataframe(df_val)
tfdv.visualize_statistics(val_stats)

## Use tfdv to compute statistics for the test set

In [13]:
# Compute TFDV statistics over the test DataFrame and render them.
test_stats = tfdv.generate_statistics_from_dataframe(df_test)
tfdv.visualize_statistics(test_stats)

# Data validation

## Create schema

In [14]:
# Infer a schema from the training statistics; the other splits are validated against it below.
schema = tfdv.infer_schema(train_stats)

In [15]:
# Show the inferred schema as a table.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'title',BYTES,required,,-
'tags',BYTES,required,,-


## Inspect anomalies in validation set

In [16]:
# Check the validation statistics against the schema inferred from training data.
anomalies = tfdv.validate_statistics(val_stats, schema=schema)
tfdv.display_anomalies(anomalies)

## Inspect anomalies in test set

In [17]:
# Check the test statistics against the same schema. The test split has no
# 'tags' column, so this run reports that column as missing.
anomalies = tfdv.validate_statistics(test_stats, schema=schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'tags',Column dropped,Column is completely missing


In [18]:
# Register TRAINING and TESTING as the schema's default environments.
schema.default_environment.append('TRAINING')
schema.default_environment.append('TESTING')

In [19]:
# Mark the 'tags' feature as not part of the TESTING environment: the test
# split has no tags column, so validation in that environment should not expect it.
tfdv.get_feature(schema, 'tags').not_in_environment.append('TESTING')

In [20]:
# Re-validate the test statistics, this time under the TESTING environment.
serving_anomalies_with_env = tfdv.validate_statistics(
        test_stats, schema, environment='TESTING') 
tfdv.display_anomalies(serving_anomalies_with_env)

In [21]:
# Side-by-side view of training vs. validation statistics.
tfdv.visualize_statistics(lhs_statistics=train_stats, rhs_statistics=val_stats, lhs_name='TRAIN', rhs_name='VAL')

In [22]:
# Side-by-side view of training vs. test statistics.
tfdv.visualize_statistics(lhs_statistics=train_stats, rhs_statistics=test_stats, lhs_name='TRAIN', rhs_name='TEST')

# Extension

## Title length

In [31]:
# Number of whitespace-separated words in each title, stored as a one-column
# frame ('title length') per split.

# Training split
title_length = [len(title.split()) for title in df_train['title'].tolist()]
df_train_extension = pd.DataFrame(title_length, columns=['title length'])

# Validation split
title_length = [len(title.split()) for title in df_val['title'].tolist()]
df_val_extension = pd.DataFrame(title_length, columns=['title length'])

# Test split
title_length = [len(title.split()) for title in df_test['title'].tolist()]
df_test_extension = pd.DataFrame(title_length, columns=['title length'])


## Number of tags

In [53]:
# The 'tags' column holds the string form of a Python list
# (e.g. "['php', 'mysql']"), so parse it with ast.literal_eval and count the
# parsed elements. Counting comma-separated pieces of the raw string instead
# reports 1 for an empty list "[]" and depends on the exact repr formatting.
num_of_tags = [len(ast.literal_eval(x)) for x in df_train['tags'].tolist()]
df_train_extension['num of tags'] = num_of_tags

# The test split has no 'tags' column, so only train and validation are covered.
num_of_tags = [len(ast.literal_eval(x)) for x in df_val['tags'].tolist()]
df_val_extension['num of tags'] = num_of_tags


## Misspelled words

In [55]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading https://files.pythonhosted.org/packages/b4/e3/64a6a11f885d2f95a680e5d7bfa6aee3e3eb5f7671ff5bba0a80cd890fb3/pyspellchecker-0.6.3-py3-none-any.whl (2.7MB)
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.6.3


In [59]:
from spellchecker import SpellChecker

bye
{'war', 'atr', 'water', 'wart', 'wat', 'wath', 'watc', 'wanr', 'waar', 'wate', 'watt', 'wats'}


In [84]:
# Lazily-created checker shared by all calls, so the dictionary is loaded once
# instead of once per title.
_SHARED_SPELL_CHECKER = None


def _shared_spell_checker():
    """Return the shared SpellChecker, building it on first use."""
    global _SHARED_SPELL_CHECKER
    if _SHARED_SPELL_CHECKER is None:
        _SHARED_SPELL_CHECKER = SpellChecker()
    return _SHARED_SPELL_CHECKER


def misspelled(word_list, spell=None):
    """Count the words in `word_list` that the spell checker would change.

    Args:
        word_list: iterable of word strings (e.g. a title split on whitespace).
        spell: optional object exposing ``correction(word)``. When omitted, a
            single shared ``SpellChecker`` instance is used.

    Returns:
        The number of words whose most likely correction differs from the
        word itself, ignoring letter case.
    """
    if spell is None:
        spell = _shared_spell_checker()

    count = 0
    for word in word_list:
        # Get the one `most likely` answer.
        correct = spell.correction(word)
        # Compare case-insensitively so a capitalised but correctly spelled
        # word (e.g. "How" -> "how") is not reported as a misspelling.
        # NOTE(review): assumes the checker may return lower-cased candidates
        # — confirm against the installed pyspellchecker version.
        # A missing correction (None) is still counted as a misspelling.
        if correct is None or correct.lower() != word.lower():
            count += 1
    return count

In [None]:
# Count likely misspellings in every training title. The result is kept under
# its own name: assigning it to `misspelled` would replace the function with a
# list, so re-running this cell (or enabling the variants below) would fail
# with "'list' object is not callable".
misspelled_counts = [misspelled(x.split()) for x in df_train['title'].tolist()]
df_train_extension['misspelled titles'] = misspelled_counts

# Validation and test variants, left disabled as in the original notebook
# (presumably because of their run time — TODO confirm).
# misspelled_counts = [misspelled(x.split()) for x in df_val['title'].tolist()]
# df_val_extension['misspelled titles'] = misspelled_counts

# misspelled_counts = [misspelled(x.split()) for x in df_test['title'].tolist()]
# df_test_extension['misspelled titles'] = misspelled_counts


## Most frequent words and tags

In [96]:
def word_count(titles):
    """Tally how often each whitespace-separated token occurs across `titles`.

    Args:
        titles: iterable of title strings.

    Returns:
        A dict mapping each token to its number of occurrences, with keys in
        order of first appearance.
    """
    words_counts = {}
    for sentence in titles:
        for token in sentence.split():
            # Start unseen tokens at zero, then add this occurrence.
            words_counts[token] = words_counts.get(token, 0) + 1
    return words_counts
                
def tag_count(tags):
    """Count the occurrences of each tag across all rows.

    Args:
        tags: iterable with one entry per row. An entry is either a sequence
            of tag strings or the string form of such a list
            (e.g. "['php', 'mysql']"), which is how the 'tags' column looks
            when read from the TSV files.

    Returns:
        A dict mapping each tag to its number of occurrences.
    """
    tags_counts = {}
    for row_tags in tags:
        if isinstance(row_tags, str):
            # Iterating over the raw string would count single characters
            # ('[', "'", 'p', ...) instead of tags, so parse it first.
            try:
                parsed = ast.literal_eval(row_tags)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple, set, frozenset)):
                row_tags = parsed
            else:
                # Not a list literal: treat the whole text as one tag.
                row_tags = [row_tags]
        for tag in row_tags:
            tags_counts[tag] = tags_counts.get(tag, 0) + 1
    return tags_counts

In [113]:
# Build one-column ('count') frequency tables, indexed by token / tag.

# Training split
words = word_count(df_train['title'].tolist())
df_train_words = pd.DataFrame.from_dict(words, orient='index', columns=['count'])
tags = tag_count(df_train['tags'].tolist())
df_train_tags = pd.DataFrame.from_dict(tags, orient='index', columns=['count'])

# Validation split
words = word_count(df_val['title'].tolist())
df_val_words = pd.DataFrame.from_dict(words, orient='index', columns=['count'])
tags = tag_count(df_val['tags'].tolist())
df_val_tags = pd.DataFrame.from_dict(tags, orient='index', columns=['count'])

# Test split: titles only
words = word_count(df_test['title'].tolist())
df_test_words = pd.DataFrame.from_dict(words, orient='index', columns=['count'])

                   count
How                17192
to                 34778
draw                 120
a                  24056
stacked               13
...                  ...
"<?xml"                1
data-rel="dialog"      1
Obj-c,                 1
Datetimes              1
Since                  1

[65531 rows x 1 columns]
