# Extract, Review, and Combine Datasets

## Data Exploration

In [1]:
# Dependencies
import os
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Datasets directories
PROJ_DIR = Path().resolve().parent
DATASETS_DIR = os.path.join(PROJ_DIR, "datasets")
SMS_SPAM_DIR = os.path.join(DATASETS_DIR, "sms_spam_collection")
YOUTUBE_SPAM_DIR = os.path.join(DATASETS_DIR, "youtube_spam_collection")

In [3]:
# Exploring the files that we want to use
# for path in Path(SMS_SPAM_DIR).glob("*"):
#     print(path)
#     try:
#         print(path.read_text())
#     except:
#         pass

for path in Path(YOUTUBE_SPAM_DIR).glob("*"):
    print(path)
    try:
        print(path.read_text())
    except:
        pass

C:\Users\maeva\Desktop\Spam-Classifier-ML-As-API\datasets\youtube_spam_collection\Youtube01-Psy.csv
C:\Users\maeva\Desktop\Spam-Classifier-ML-As-API\datasets\youtube_spam_collection\Youtube02-KatyPerry.csv
C:\Users\maeva\Desktop\Spam-Classifier-ML-As-API\datasets\youtube_spam_collection\Youtube03-LMFAO.csv
C:\Users\maeva\Desktop\Spam-Classifier-ML-As-API\datasets\youtube_spam_collection\Youtube04-Eminem.csv
C:\Users\maeva\Desktop\Spam-Classifier-ML-As-API\datasets\youtube_spam_collection\Youtube05-Shakira.csv
C:\Users\maeva\Desktop\Spam-Classifier-ML-As-API\datasets\youtube_spam_collection\__MACOSX


In [4]:
# Set the input paths for the actual data
sms_spam_input_path = os.path.join(SMS_SPAM_DIR, "SMSSpamCollection") # TSV File
# Path(sms_spam_input_path).read_text()

youtube_spam_input_path = YOUTUBE_SPAM_DIR # CSV Files
# Path(youtube_spam_input_path).read_text()

## Data Manipulation

Now, let's start to manipulate those data and transform them

### For the SMS Data

In [5]:
# Read the sms data into a dataframe
sms_df = pd.read_csv(
    sms_spam_input_path, 
    header=None,
    sep="\t" # sms_spam_input_path is a TSV file
) 

# Set the column headers
sms_df.columns = ("label", "text")

# Add the category source
sms_df["source"] = "sms-spam"

# Check result
display(sms_df.shape)
display(sms_df.head())
display(sms_df.tail())

# ham = Good value
# spam = Bad value

(5572, 3)

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


Unnamed: 0,label,text,source
5567,spam,This is the 2nd time we have tried 2 contact u...,sms-spam
5568,ham,Will ü b going to esplanade fr home?,sms-spam
5569,ham,"Pity, * was in mood for that. So...any other s...",sms-spam
5570,ham,The guy did some bitching but I acted like i'd...,sms-spam
5571,ham,Rofl. Its true to its name,sms-spam


### For the YouTube Data

In [6]:
# Reading each csv file will create a df
all_dfs = []

# We only want to grab the .csv files in the input_path
for path in Path(youtube_spam_input_path).glob("*.csv"):
    
    # Load them into a dataframe
    temp_df = pd.read_csv(path)
    
    # Set the column headers
    temp_df.rename(columns={
        "CLASS": "raw_label", # We want to keep the raw_label the same, not converting to 0 and 1 as for sms_df
        "CONTENT": "text",
        "COMMENT_ID": "comment_id",
        "AUTHOR": "author",
        "DATE": "date"
    }, inplace=True)
    
    # Add the file source
    temp_df["source_file"] = path.name
    
    # Add the category source
    temp_df["source"] = "youtube-spam"
    
    # Append to the list of all dataframes
    all_dfs.append(temp_df)
    
# Finally, concatenate all the dataframes
yt_df = pd.concat(all_dfs).reset_index(drop=True)

# Here, we can derived the 0-1 labels from the raw_label
# Vectorized version 
yt_df["label"] = np.where(yt_df["raw_label"] == 1, "spam", "ham") # Iterative version: yt_df["label"] = yt_df["raw_label"].apply(lambda x: "spam" if str(x) == "1" else "ham")

# Check result
display(yt_df.shape)
display(yt_df.head())
display(yt_df.tail())

(1956, 8)

Unnamed: 0,comment_id,author,date,text,raw_label,source_file,source,label
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1,Youtube01-Psy.csv,youtube-spam,spam
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1,Youtube01-Psy.csv,youtube-spam,spam
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1,Youtube01-Psy.csv,youtube-spam,spam
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,Youtube01-Psy.csv,youtube-spam,spam
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1,Youtube01-Psy.csv,youtube-spam,spam


Unnamed: 0,comment_id,author,date,text,raw_label,source_file,source,label
1951,_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0,Youtube05-Shakira.csv,youtube-spam,ham
1952,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0,Youtube05-Shakira.csv,youtube-spam,ham
1953,_2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs,jeffrey jules,2013-07-13T12:09:31.188000,wow,0,Youtube05-Shakira.csv,youtube-spam,ham
1954,_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0,Aishlin Maciel,2013-07-13T11:17:52.308000,Shakira u are so wiredo,0,Youtube05-Shakira.csv,youtube-spam,ham
1955,_2viQ_Qnc685RPw1aSa1tfrIuHXRvAQ2rPT9R06KTqA,Latin Bosch,2013-07-12T22:33:27.916000,Shakira is the best dancer,0,Youtube05-Shakira.csv,youtube-spam,ham


In [7]:
# Vectorized version: np.where()
# %timeit yt_df["label"] = np.where(yt_df["raw_label"] == 1, "spam", "ham")
# Iterative version: .apply()
# %timeit yt_df["label"] = yt_df["raw_label"].apply(lambda x: "spam" if str(x) == "1" else "ham")
# Iterative version: .map()
# %timeit yt_df["label"] = yt_df["raw_label"].map(lambda x: "spam" if str(x) == "1" else "ham")

### Combining The 2 Datasets

In [8]:
# We only want the label, text, and source columns
spam_df = pd.concat(
    [sms_df, yt_df], 
    ignore_index=True
)[["label", "text", "source"]]

# Check result
display(spam_df.shape)
display(spam_df.head())
display(spam_df.tail())

(7528, 3)

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


Unnamed: 0,label,text,source
7523,ham,I love this song because we sing it at Camp al...,youtube-spam
7524,ham,I love this song for two reasons: 1.it is abou...,youtube-spam
7525,ham,wow,youtube-spam
7526,ham,Shakira u are so wiredo,youtube-spam
7527,ham,Shakira is the best dancer,youtube-spam


## Exports

In [9]:
# Export directory
EXPORTS_DIR = os.path.join(DATASETS_DIR, "exports")
SPAM_DATASET_PATH = os.path.join(EXPORTS_DIR, "spam-dataset.csv")

# Create the directory if needed
if not os.path.isdir(EXPORTS_DIR): 
    os.mkdir(EXPORTS_DIR)
    
# Export the combined spam dataset into a file
spam_df.to_csv(SPAM_DATASET_PATH, index=False)