In [1]:
import pandas as pd
from tqdm import tqdm

This notebook shows how the property vectors stored in the `PROPERTIES` column are extracted into a workable format (one column per property). It also adds year, month, day, weekday, and hour as features so that they can be used in the machine learning models.

This notebook is "just for show" and should not be run without a good reason: it is not very performant and takes around 4 hours to extract the data.

In [None]:
# Load the dataset
train_data = pd.read_csv('soc-redditHyperlinks-body.tsv', sep="\t")

# Names for the comma-separated values stored in 'PROPERTIES', in the order
# in which they appear inside each string.
property_columns = [
    'Num_Characters', 'Num_Characters_No_Whitespace', 'Fraction_Alphabetical',
    'Fraction_Digits', 'Fraction_Uppercase', 'Fraction_Whitespace',
    'Fraction_Special_Characters', 'Num_Words', 'Num_Unique_Words',
    'Num_Long_Words', 'Avg_Word_Length', 'Num_Unique_Stopwords',
    'Fraction_Stopwords', 'Num_Sentences', 'Num_Long_Sentences',
    'Avg_Characters_Per_Sentence', 'Avg_Words_Per_Sentence',
    'Automated_Readability_Index', 'Positive_Sentiment_VADER',
    'Negative_Sentiment_VADER', 'Compound_Sentiment_VADER',
    'LIWC_Funct', 'LIWC_Pronoun', 'LIWC_Ppron', 'LIWC_I', 'LIWC_We',
    'LIWC_You', 'LIWC_SheHe', 'LIWC_They', 'LIWC_Ipron', 'LIWC_Article',
    'LIWC_Verbs', 'LIWC_AuxVb', 'LIWC_Past', 'LIWC_Present', 'LIWC_Future',
    'LIWC_Adverbs', 'LIWC_Prep', 'LIWC_Conj', 'LIWC_Negate', 'LIWC_Quant',
    'LIWC_Numbers', 'LIWC_Swear', 'LIWC_Social', 'LIWC_Family', 'LIWC_Friends',
    'LIWC_Humans', 'LIWC_Affect', 'LIWC_Posemo', 'LIWC_Negemo', 'LIWC_Anx',
    'LIWC_Anger', 'LIWC_Sad', 'LIWC_CogMech', 'LIWC_Insight', 'LIWC_Cause',
    'LIWC_Discrep', 'LIWC_Tentat', 'LIWC_Certain', 'LIWC_Inhib', 'LIWC_Incl',
    'LIWC_Excl', 'LIWC_Percept', 'LIWC_See', 'LIWC_Hear', 'LIWC_Feel',
    'LIWC_Bio', 'LIWC_Body', 'LIWC_Health', 'LIWC_Sexual', 'LIWC_Ingest',
    'LIWC_Relativ', 'LIWC_Motion', 'LIWC_Space', 'LIWC_Time', 'LIWC_Work',
    'LIWC_Achiev', 'LIWC_Leisure', 'LIWC_Home', 'LIWC_Money', 'LIWC_Relig',
    'LIWC_Death', 'LIWC_Assent', 'LIWC_Dissent', 'LIWC_Nonflu', 'LIWC_Filler'
]

# Parse every 'PROPERTIES' string into a list of floats. The rows are collected
# in a plain Python list first and turned into a DataFrame in one step below:
# appending to a DataFrame row by row (`frame.loc[index] = ...`) re-allocates
# the frame on every insert and made this cell take hours.
property_rows = [
    [float(value) for value in properties_str.split(',')]
    for properties_str in tqdm(train_data['PROPERTIES'], desc='Parsing PROPERTIES')
]

# Fail loudly on malformed rows; the DataFrame constructor would otherwise pad
# short rows with NaN without any warning.
if any(len(values) != len(property_columns) for values in property_rows):
    raise ValueError(
        f"Every 'PROPERTIES' entry must contain exactly {len(property_columns)} values"
    )

# Build the frame once, reusing the index of `train_data` so that both frames
# stay row-aligned for a later column-wise concatenation.
train_data_expl = pd.DataFrame(
    property_rows, columns=property_columns, index=train_data.index
)

In [None]:
# Attach the exploded property columns to the right of the original columns.
# Rows are matched on the index of the two frames.
frames_to_join = [train_data, train_data_expl]
train_data = pd.concat(frames_to_join, axis="columns")

In [None]:
# Parse the 'TIMESTAMP' column into pandas datetimes
train_data['TIMESTAMP'] = pd.to_datetime(train_data['TIMESTAMP'])

# Derive one calendar feature column per datetime component.
# Note: 'weekday' is 0 for Monday through 6 for Sunday.
timestamp_parts = train_data['TIMESTAMP'].dt
for part in ('year', 'month', 'day', 'weekday', 'hour'):
    train_data[part] = getattr(timestamp_parts, part)

# Persist the enriched frame (the index is written as the first CSV column)
train_data.to_csv('reddit_exploded.csv')