In [22]:
import pandas as pd
import json
import re
import nltk
from langdetect import detect
from html import unescape
from tabulate import tabulate

In [23]:
base_path = "C:/Users/ASUS/Desktop/Automatic-Privacy-Detection/data/"

training_dataset = base_path+'twitter-cikm-2010/labeled_training_set.csv'
test_dataset = base_path+'twitter-cikm-2010/labeled_test_set.csv'


def clean_tweets(df, min_length=8):
    # Define regex patterns for matching
    rt_pattern = r"^RT @\S+: "
    link_pattern = r"http\S+"
    mention_pattern = r"@\S+"
    hashtag_pattern = r"#\S+"
    unicode_pattern = r"\u0092"

    # Define function for removing regex patterns from text
    def remove_patterns(text, patterns):
        for pattern in patterns:
            text = re.sub(pattern, "", text)
        return text.replace('\u0092', "'")

    # Define function for checking if text is in English
    def is_english(text):
        try:
            lang = detect(text)
            return lang == "en"
        except:
            return False

    # Remove unwanted patterns from text
    df['text'] = df['text'].apply(lambda x: remove_patterns(x, [rt_pattern, link_pattern, mention_pattern, hashtag_pattern, unicode_pattern]))

    # Remove non-English text and short text
    df = df[df['text'].apply(lambda x: is_english(x) and len(x) >= min_length)]

    return df.drop_duplicates()


# read in the CSV file
df = pd.read_csv(training_dataset)

# create a new column based on the values of the other columns
df['class'] = df.apply(lambda x: 0 if x['Emotional_disclosure'] == 0 and x['Information_disclosure'] == 0 and x['Info_support'] == 0 and x['Emo_support'] == 0 else 1, axis=1)

# rename the "full_text" column to "text"
df = df.rename(columns={'full_text': 'text'})

# drop all other columns except for "text" and "class"
df = df[['text', 'class']]
df = clean_tweets(df)
df.to_csv(base_path+'training_set.csv', index=False)
# print the resulting dataframe
print(df)

                                                    text  class
0      Get two nice notebooks and write it down for e...      0
1                I<U+0092>m sobbing reading this thread!      1
2                               Hope you have a nice day      0
3      My wife came in when I was around half way thr...      1
4            I am crying a lot of happy tears right now.      1
...                                                  ...    ...
12855            That's wild and a hell of a close call.      1
12856              That's pretty much what my wife said.      0
12857  I got into her line because she'd started work...      1
12858         We've been friends for about a year or so.      1
12859  As for how we got together, I was taking care ...      1

[12485 rows x 2 columns]


In [24]:
# Group the dataframe by the 'class' column
grouped = df.groupby('class')

# Get the group with class = 0
class_0 = grouped.get_group(0)

# Get the group with class = 1
class_1 = grouped.get_group(1)

# Print the two resulting dataframes
print("Non-sensitive:\n", class_0)
print("\nSensitive:\n", class_1)


Non-sensitive:
                                                     text  class
0      Get two nice notebooks and write it down for e...      0
2                               Hope you have a nice day      0
21     I don<U+0092>t think he<U+0092>s seen my message.      0
24     It<U+0092>s not uncommon for someone (especial...      0
26     It takes longer but it is much safer and well ...      0
...                                                  ...    ...
12843                                    OP is a girl...      0
12844                         He'll find out eventually.      0
12847                     I don't understand your title.      0
12848                               Insulting me now eh?      0
12856              That's pretty much what my wife said.      0

[4625 rows x 2 columns]

Sensitive:
                                                     text  class
1                I<U+0092>m sobbing reading this thread!      1
3      My wife came in when I was around half way 

In [25]:
# read in the CSV file
df = pd.read_csv(test_dataset)

# create a new column based on the values of the other columns
df['class'] = df.apply(lambda x: 0 if x['Emotional_disclosure'] == 0 and x['Information_disclosure'] == 0 and x['Info_support'] == 0 and x['Emo_support'] == 0 else 1, axis=1)

# rename the "full_text" column to "text"
df = df.rename(columns={'full_text': 'text'})

# drop all other columns except for "text" and "class"
df = df[['text', 'class']]
df = clean_tweets(df)
df.to_csv(base_path+'test_set.csv', index=False)
# print the resulting dataframe
print(df)

                                                   text  class
0     As time goes on, it's easier to recognize what...      0
1     it's a lot to handle, you don't have to take i...      1
2     If you got issues with your apartment, talk to...      0
3     Check to see if your son's school has any open...      0
4     Maybe you should try to take an outside perspe...      1
...                                                 ...    ...
4995  Stand up if someone disrespects you, let them ...      1
4996  I think you should just continue what you are ...      1
4997  Get a babysitter, take her to a quiet place an...      1
4998  Try to sort out your feelings and talk to her ...      1
4999  Speaking to a grief councillor is also somethi...      1

[4867 rows x 2 columns]


In [26]:
# Group the dataframe by the 'class' column
grouped = df.groupby('class')

# Get the group with class = 0
class_0 = grouped.get_group(0)

# Get the group with class = 1
class_1 = grouped.get_group(1)

# Print the two resulting dataframes
print("Non-sensitive:\n", class_0)
print("\nSensitive:\n", class_1)

Non-sensitive:
                                                    text  class
0     As time goes on, it's easier to recognize what...      0
2     If you got issues with your apartment, talk to...      0
3     Check to see if your son's school has any open...      0
5     I say tell everyone, for better or worse you w...      0
7     Maybe if you can see it as the enemy it is and...      0
...                                                 ...    ...
4374  The right person at the wrong time is still th...      0
4446  We're open about them, because we accept that ...      0
4525  So I need to have a solid "base" or I risk fee...      0
4736  It<U+0092>s kind of like in the previous gener...      0
4809  You put yourself out there and try to be a goo...      0

[462 rows x 2 columns]

Sensitive:
                                                    text  class
1     it's a lot to handle, you don't have to take i...      1
4     Maybe you should try to take an outside perspe...      1
6 