In [1]:
import json
import os
import re
import string

import pandas as pd
from langdetect import detect

In [None]:
# Root directory of the datasets. Defaults to the original author's local
# checkout; set the PRIVACY_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
base_path = (
    os.environ.get("PRIVACY_DATA_DIR")
    or "C:/Users/ASUS/Desktop/Automatic-Privacy-Detection/data/"
)
# File paths are built from base_path by plain string concatenation, so it
# must end with a path separator.
if not base_path.endswith(("/", "\\")):
    base_path += "/"

def get_whisper(filename):
    """Load a JSON Lines file into a DataFrame of ``serial`` / ``text`` pairs.

    Every non-blank line of ``filename`` is parsed as one JSON object and its
    ``"serial"`` and ``"text"`` fields are collected; any other fields are
    ignored.

    Args:
        filename: Path of the JSON Lines file to read.

    Returns:
        pandas.DataFrame with the columns ``serial`` and ``text``, one row per
        parsed record, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        KeyError: If a record has no ``"serial"`` or ``"text"`` field.
    """
    serial = []
    text = []
    # The context manager guarantees the handle is closed, even when a line
    # fails to parse half-way through the file.
    with open(filename, 'r') as handle:
        for line in handle:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would otherwise make json.loads raise.
            if not line.strip():
                continue
            record = json.loads(line)
            serial.append(record["serial"])
            text.append(record["text"])
    return pd.DataFrame({"serial": serial, "text": text})

def get_twitter(filename):
    """Load the text field of a tab-separated file into a DataFrame.

    Each line of ``filename`` is split on tab characters. Lines that yield
    exactly four fields contribute their third field; all other lines are
    skipped silently.

    Args:
        filename: Path of the tab-separated file to read.

    Returns:
        pandas.DataFrame with a single ``text`` column, in file order.
    """
    text = []
    # Use a context manager so the file handle is always released.
    with open(filename, 'r') as handle:
        for line in handle:
            fields = line.split('\t')
            # Only well-formed four-field rows are kept.
            if len(fields) == 4:
                text.append(fields[2])
    return pd.DataFrame({"text": text})

def is_english(text):
    """Tell whether ``detect`` classifies ``text`` as English.

    Args:
        text: The value to classify; it is passed to ``detect`` unchanged.

    Returns:
        True when ``detect(text)`` returns ``'en'``. False for any other
        result, and also when ``detect`` raises an ordinary exception.
    """
    try:
        language = detect(text)
    except Exception:
        # Best effort: input the detector cannot handle counts as "not English".
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so a long-running
        # row-by-row pass can still be interrupted.
        return False
    return language == 'en'

def replace_if_lang(df, text):
    """Replace every entry of ``df['text']`` that is not English with ``text``.

    The frame is modified in place and also returned, so the call can be used
    either for its side effect or in an assignment chain.

    Args:
        df: DataFrame with a ``text`` column.
        text: Replacement stored for entries ``is_english`` rejects.
            (Previously this argument was ignored and ``""`` was always used.)

    Returns:
        The same ``df`` object, with its ``text`` column updated.
    """
    # A plain list keeps the assignment positional and, unlike
    # ``df.apply(..., axis=1).tolist()``, also works on an empty frame
    # (where ``apply`` hands back a DataFrame that has no ``tolist``).
    df['text'] = [
        value if is_english(value) else text
        for value in df['text']
    ]
    return df

def replace_in_df(df, regex, text):
    """Apply ``re.sub(regex, text, ...)`` to every entry of ``df['text']``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``text`` column of strings.
        regex: Regular expression (string or compiled pattern) to search for.
        text: Replacement for every match.

    Returns:
        The same ``df`` object, with its ``text`` column updated.

    Raises:
        TypeError: If an entry of the ``text`` column is not a string.
    """
    # Compile once instead of once per row.
    pattern = re.compile(regex)
    # A list comprehension over the column avoids the row-wise ``apply``,
    # which returns a DataFrame (without ``tolist``) when the frame is empty.
    df['text'] = [pattern.sub(text, value) for value in df['text']]
    return df

def clean_tweets(df, min_length=8):
    """Normalise the ``text`` column and drop rows that are not usable.

    Steps, in order:
      1. Blank out whole texts containing ``RT @`` or ``[[``.
      2. Strip links, mentions and hashtags; turn newlines into spaces;
         collapse runs of spaces; trim one leading / trailing space.
      3. Blank out texts ``replace_if_lang`` does not accept as English.
      4. Drop rows whose text is shorter than ``min_length`` characters
         (this also removes everything blanked above) and duplicate rows.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: DataFrame with a ``text`` column of strings.
        min_length: Minimum number of characters a cleaned text must have
            to be kept.

    Returns:
        A new DataFrame containing the cleaned, filtered, de-duplicated rows.
    """
    # Work on a copy: the helpers below assign to df['text'] in place, which
    # would otherwise rewrite the caller's frame as a side effect.
    df = df.copy()
    # delete the entire tweet (1/2)
    # (?s) lets '.' cross newlines; without it only the matching *line* of a
    # multi-line tweet was removed and the remaining lines survived.
    df = replace_in_df(df, r"(?s).*RT @.*", "") # retweets
    df = replace_in_df(df, r"(?s).*\[\[.*", "") # placeholders
    # delete only a part of the tweet
    df = replace_in_df(df, r"http(s?)://\S+", "") # links
    df = replace_in_df(df, r"@\S+", "") # mentions
    df = replace_in_df(df, r"#\S+", "") # hashtags
    df = replace_in_df(df, r"\n", " ") # newline
    df = replace_in_df(df, r"  +", " ") # multiple spaces
    df = replace_in_df(df, r"^ ", "") # space at the begin of line
    df = replace_in_df(df, r" $", "") # space at the end of line
    # delete the entire tweet (2/2)
    df = replace_if_lang(df, "") # texts not in English
    df = df[df['text'].map(len) >= min_length] # short text
    return df.drop_duplicates()

def get_datasets(whisper_filename, twitter_filename):
    """Load, label and clean the Whisper dataset.

    The file is read with ``get_whisper``, every row is tagged with the class
    ``"sens"``, the frame is reduced to the ``text`` and ``class`` columns and
    the result is passed through ``clean_tweets``.

    Args:
        whisper_filename: Path handed to ``get_whisper``.
        twitter_filename: Currently unused; the Twitter ("ns") load is
            disabled (see the comment in the body).

    Returns:
        The cleaned DataFrame built from ``whisper_filename``.
    """
    def load_labelled(path, loader, label):
        # Read the raw frame, tag every row, keep only the two columns the
        # cleaning step works on.
        frame = loader(path)
        frame["class"] = label
        frame = frame[['text', 'class']]
        return clean_tweets(frame)

    # Twitter ("ns") loading is disabled:
    #     load_labelled(twitter_filename, get_twitter, "ns")
    return load_labelled(whisper_filename, get_whisper, "sens")

def save_samples_data(num_samples, df_sens, df_ns, num_sens, num_ns, filename_prefix):
    """Write ``num_samples`` shuffled CSV files mixing rows of both frames.

    Sample ``k`` (0-based) takes rows ``[num_sens*k, num_sens*(k+1))`` of
    ``df_sens`` and rows ``[num_ns*k, num_ns*(k+1))`` of ``df_ns`` (by
    position), concatenates them, shuffles the result with the seed ``2**k``
    and writes it, without the index, to
    ``<filename_prefix><k+1 zero-padded to two digits>.csv``.

    Args:
        num_samples: Number of files to write.
        df_sens: First source DataFrame.
        df_ns: Second source DataFrame.
        num_sens: Rows taken from ``df_sens`` per file.
        num_ns: Rows taken from ``df_ns`` per file.
        filename_prefix: Path prefix of the output files.

    Returns:
        None.
    """
    for count in range(num_samples):
        sens_chunk = df_sens.iloc[num_sens * count:num_sens * (count + 1)]
        ns_chunk = df_ns.iloc[num_ns * count:num_ns * (count + 1)]
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        # NOTE(review): the seed 2**count leaves the 32-bit range NumPy's
        # legacy RandomState accepts once count >= 32 -- confirm whether more
        # than 32 samples are ever requested.
        shuffled = (
            pd.concat([sens_chunk, ns_chunk])
            .sample(frac=1, random_state=2**count)
            .reset_index(drop=True)
        )
        # zfill pads to two digits but, unlike slicing the last two
        # characters, never truncates numbers >= 100 (which made file names
        # collide).
        shuffled.to_csv(filename_prefix + str(count + 1).zfill(2) + ".csv", index=False)
    return

# Build the cleaned dataset; both input files are located under base_path.
df_sens = get_datasets(
    base_path + 'whisper/final-anonymized.jun14.jun16.whisper-part-000.json',
    base_path + 'twitter-cikm-2010/labeled_training_set.csv',
)

In [None]:
# Show the frame returned by get_datasets as this cell's output.
df_sens