In [1]:
### Generate three dataframes -------------------------
# df_original --- train1800 and test1800 in data_original
# df_error    --- result of out of bag forecast in data_handlabeling_cycle1
# df_handlabeled --- hand-labeled data in data_handlabeling_cycle1

### Analyze handlabeled data -------------------------
## df_merged  --- df_original + df_handlabeled

### Refine handlabeled data -------------------------
## then generate df_refined, df_conflict, df_one_unknown from df_merged

### Generate train, test data -------------------------
## then generate train, test from df_refined + df_original_kept

### Move files to s3
## offline: df_conflict, df_one_unknown, train_refined, test_refined

# Generate three dfs

- df_error    --- result of out of bag forecast
- df_original_kept --- from (df_original (=train1800 and test1800) minus df_error)
- df_handlabeled --- hand-labeled data

In [2]:
# # Install a pip package in the current Jupyter kernel
# import sys
# !{sys.executable} -m pip install s3fs

import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re
import sys
import os
from sklearn.model_selection import train_test_split

# URL0 = 's3://smart-newsdev-dmp/tmp/data/classification/error_handlabeled_combined.csv'
# URL1 = '../data/train1800.csv'
# URL2 = '../data/test1800.csv'
# URL3 = 'error_combined.csv'

URL0 = 's3://smart-newsdev-dmp/tmp/data/classification/data_handlabeling_cycle1/error_handlabeled_combined.csv'
URL1 = 's3://smart-newsdev-dmp/tmp/data/classification/data_original/train1800.csv'
URL2 = 's3://smart-newsdev-dmp/tmp/data/classification/data_original/test1800.csv'
URL3 = 's3://smart-newsdev-dmp/tmp/data/classification/data_handlabeling_cycle1/error_combined.csv'


In [3]:
def clean_idx(data):
    data = data[data['idx'].notnull()]
    data['idx'] = data['idx'].map(int)
    
    return data
    
def remove_short_texts(data):
    pd.options.mode.chained_assignment = None
    data = clean_idx(data)
    
    data['text'] = data['text'].map(str).apply(lambda x: re.sub("\[article_title\]\s\[article_body\]","", x))
    data['text'] = data['text'].map(str).apply(lambda x: re.sub("News Entertainment Lifestyle Tech & Innovation All Sections","", x))

    data['text'] = data['text'].map(str).apply(lambda x: " ".join(x.split()))
    data['text'] = data['text'].map(str).apply(lambda x: x.strip())
    data['title'] = data['title'].map(str).apply(lambda x: x.strip())
    
    
    data['char_length1'] = data['text'].map(str).apply(len)
    data = data[data['char_length1'] >= 200]
    data['char_length2'] = data['title'].map(str).apply(len)
    data = data[data['char_length2'] >= 22]
    data.drop(['char_length1', 'char_length2'], axis=1, inplace=True)
      
    return data

def generate_original_df(URL1, URL2):

    df1 = pd.read_csv(URL1, sep='|')
    df2 = pd.read_csv(URL2, sep='|')

    df_original = pd.concat([df1, df2])

    df_original = remove_short_texts(df_original)
    
    return df_original


df_original = generate_original_df(URL1, URL2)
print (df_original.shape)

df_error = pd.read_csv(URL3, sep='|')
df_error = clean_idx(df_error)
print (df_error.shape)

df_handlabeled = pd.read_csv(URL0, sep='|')
df_handlabeled.drop(['title', 'text', 'url'], axis=1, inplace=True)
print (df_handlabeled.shape)

print (df_original.shape)
df_original_kept = df_original[~df_original['idx'].isin(df_error['idx'].values)]
print (df_original_kept.shape)

(14465, 6)
(6565, 6)
(4000, 5)
(14465, 6)
(8209, 6)


# Analyze handlabeled data

In [4]:
def merge_original_handlabeled(df_original, df_handlabeled):

    df_merged = pd.merge(df_original[df_original['idx'].isin(df_handlabeled['idx'].values)], 
                         df_handlabeled, how ='left', left_on=['idx'], right_on=['idx'])
    
    return df_merged

def assess_df(df):
    # fill na
    df['Category_person1'].fillna(0, inplace=True)
    df['Category_person2'].fillna(0, inplace=True) 
    
    total_rows = df.shape[0]
    null_rows = df[(df['Category_person1'] ==0) & (df['Category_person2'] == 0)].shape[0] # both nan
    print ("Number of rows is {}".format(total_rows))
    print ("Number of rows at least one are labeled is {}".format(total_rows - null_rows))
    
    df = df[(df['Category_person1']!=0) | (df['Category_person2']!=0)] # remove 0 and 0
    
    print ("Breakdown ---------------------------------------------------------------")
    
    print ("2 Unknowns is {}".format(df[(df['Category_person1']=='UNKNOWN')& (df['Category_person2']=='UNKNOWN')].shape[0]))
    df = df[(df['Category_person1'] !='UNKNOWN') | (df['Category_person2'] !='UNKNOWN')] # remove 2 unknowns

    print ("2 baddata is {}".format(df[(df['Category_person1']=='BADDATA')& (df['Category_person2']=='BADDATE')].shape[0]))
    df = df[(df['Category_person1'] !='BADDATE') | (df['Category_person2'] !='BADDATE')] # remove 2 baddata
    
    print ("2 votes is {}".format(df [df['Category_person1'] == df['Category_person2'] ].shape[0]))
    df = df [df['Category_person1'] != df['Category_person2'] ] # remove 2 votes

    print ("1 Baddata is {}".format(df[(df['Category_person1']=='BADDATA') | (df['Category_person2']=='BADDATA')].shape[0]))
    df = df[(df['Category_person1'] !='BADDATA') & (df['Category_person2'] !='BADDATA')] # remove 1 baddata
    
    print ("1 vote + 'UNKNOWN' is {}".format(df [(df['Category_person1']=='UNKNOWN') | (df['Category_person2']=='UNKNOWN') ].shape[0]))
    df = df[(df['Category_person1'] != 'UNKNOWN') & (df['Category_person2'] != 'UNKNOWN')] # removce 1 unknown
    
    print ("1 vote + non-vote is {}".format(df [(df['Category_person1']==0) | (df['Category_person2']==0) ].shape[0]))
    df = df[(df['Category_person1'] != 0) & (df['Category_person2'] != 0)] # remove 1 label
    
    print ("Conflict votes is {}".format(df.shape[0]))
    
    
df_merged = merge_original_handlabeled(df_original, df_handlabeled)
df_merged.shape
assess_df(df_merged)

Number of rows is 4000
Number of rows at least one are labeled is 2829
Breakdown ---------------------------------------------------------------
2 Unknowns is 8
2 baddata is 0
2 votes is 438
1 Baddata is 392
1 vote + 'UNKNOWN' is 201
1 vote + non-vote is 1619
Conflict votes is 171


# Refine handlabeled data

In [5]:
def refine_df(df):
    # fillna with 0
    df['Category_person1'].fillna(0, inplace=True)
    df['Category_person2'].fillna(0, inplace=True)    
    
    # remove nan and nan
    df = df[(df['Category_person1']!=0) | (df['Category_person2']!=0)] # remove 0 and 0
    
    # remove two unknowns (keep one known)
    df = df[(df['Category_person1'] !='UNKNOWN') | (df['Category_person2'] !='UNKNOWN')]
    
    # remove one baddata
    df = df[(df['Category_person1'] !='BADDATA') & (df['Category_person2'] !='BADDATA')]
    
    
    # create new column 're_category'
    def column_rule(row):
        
        # two votes
        if row['Category_person1'] == row['Category_person2']:
            val = row['Category_person1']            
        
        # prioritize world
        elif row['Category_person1'] == 'EN_US_WORLD' or row['Category_person2'] == 'EN_US_WORLD':
            val = 'EN_US_WORLD'
            
        # overwrite in case either one is null or unknown
        elif row['Category_person1'] and (row['Category_person2'] == 0  or row['Category_person2'] == 'UNKNOWN'):
            val = row['Category_person1']
        
        # overwrite in case either one is null or unknown
        elif (row['Category_person1'] ==0 or row['Category_person1'] == 'UNKNOWN') and row['Category_person2']:
            val = row['Category_person2']
        
        else:
            val = -1
            
        return val
    
    df['re_category'] = df.apply(column_rule, axis=1)
    
    df_conflict = df[df['re_category'] == -1].loc[:,['idx','title','text','url','category','Category_person1','note_person1','Category_person2','note_person2']]
    
    df_refined = df[df['re_category'] != -1]
    
    print ('df_refined:')
    print (df_refined.shape[0])
    print ('category == re_category:')
    print (df_refined[df_refined['re_category'] == df_refined['category']].shape[0])
    
    df_one_unknown = df[(df['Category_person1'] == 'UNKNOWN')| (df['Category_person2'] == 'UNKNOWN')].loc[:,['idx','title','text','url','category','Category_person1','note_person1','Category_person2','note_person2']]
    
    
    return df_refined, df_conflict, df_one_unknown

df_refined, df_conflict, df_one_unknown = refine_df(df_merged)

df_refined.drop(['Category_person1', 'note_person1','Category_person2','note_person2','category'], axis=1, inplace=True)
df_refined.rename(columns={'re_category': 'category'}, inplace=True)

print (df_refined.shape)

print (df_original_kept.shape)

df_temp = pd.concat([df_original_kept, df_refined])

print (df_temp.shape)

df_refined:
2271
category == re_category:
1084
(2271, 6)
(8209, 6)
(10480, 6)


# Generate train, test data

In [6]:
df_temp = df_temp[df_temp['category'] != 'UNKNOWN']

# remove shorter text again
df_temp['char_length1'] = df_temp['text'].map(str).apply(len)
df_temp = df_temp[df_temp['char_length1'] >= 200]
print (df_temp.shape)

# remove duplicates
df_temp.drop_duplicates(['idx'],inplace=True)
print (df_temp.shape)

train, test = train_test_split(df_temp, test_size=0.2, random_state=42)
print (train.category.value_counts())

(10374, 7)
(9855, 7)
EN_US_SPORTS           1117
EN_US_ENTERTAINMENT     970
EN_US_SCIENCE           966
EN_US_POLITICS          926
EN_US_WORLD             923
EN_US_TECHNOLOGY        914
EN_US_NATIONAL          740
EN_US_BUSINESS          686
EN_US_LIFESTYLE         642
Name: category, dtype: int64


# Move files to s3

In [7]:
train.to_csv('train.csv', sep='|', index = False)
test.to_csv('test.csv', sep='|', index = False)
df_conflict.to_csv('conflict.csv', sep=',', index = False)
df_one_unknown.to_csv('one_unknown.csv', sep=',', index = False)

In [8]:
from io import StringIO
import boto3

def upload_to_s3(bucket, folder, file):
    s3 = boto3.resource('s3')
    data = open(file, "rb")
    key = 'tmp/data/classification/' + folder + file
    s3.Bucket(bucket).put_object(Key=key, Body=data)

bucket = 'smart-newsdev-dmp'
folder = 'data_handlabeling_cycle1/'

upload_to_s3(bucket, folder, 'conflict.csv')
upload_to_s3(bucket, folder, 'one_unknown.csv')
upload_to_s3(bucket, folder, 'train.csv')
upload_to_s3(bucket, folder, 'test.csv')

# Manually add tag (SageMaker = true) on the new files in s3