In [55]:
import pathlib 
import zipfile
import pandas as pd

In [93]:
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
SPAM_DATASET_PATH = EXPORT_DIR / "spam-dataset.csv"


ZIPS_DIR = DATASET_DIR / "zips"
ZIPS_DIR.mkdir(exist_ok=True, parents=True)

SPAM_SMS_ZIP_PATH = ZIPS_DIR / 'sms_spam_dataset.zip'
SPAM_YOUTUBE_ZIP_PATH = ZIPS_DIR / 'youtube_spam_dataset.zip'

In [44]:
SMS_SPAM_ZIP = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
YOUTUBE_SPAM_ZIP = "https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip"

In [9]:
!curl $SMS_SPAM_ZIP -o $SPAM_SMS_ZIP_PATH
!curl $YOUTUBE_SPAM_ZIP -o $SPAM_YOUTUBE_ZIP_PATH

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
 31  198k   31 63488    0     0  16531      0  0:00:12  0:00:03  0:00:09 16537
100  198k  100  198k    0     0  46025      0  0:00:04  0:00:04 --:--:-- 46042
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 29  159k   29 47616    0     0  27563      0  0:0

In [14]:
SPAM_CLASSIFIER_DIR = DATASET_DIR / "spam-classifier"
SMS_SPAM_DIR = SPAM_CLASSIFIER_DIR / "spam-sms"
YOUTUBE_SPAM_DIR = SPAM_CLASSIFIER_DIR / "youtube-spam"

SMS_SPAM_DIR.mkdir(exist_ok=True, parents=True)
YOUTUBE_SPAM_DIR.mkdir(exist_ok=True, parents=True)

In [45]:
def unzip(source_filename, dest_dir):
    with zipfile.ZipFile(source_filename) as zf:
        zf.extractall(dest_dir)

In [48]:
unzip(SPAM_YOUTUBE_ZIP_PATH, YOUTUBE_SPAM_DIR)
unzip(SPAM_SMS_ZIP_PATH, SMS_SPAM_DIR)

In [80]:
sms_spam_input_path = SMS_SPAM_DIR/ "SMSSpamCollection"
# sms_spam_input_path.read_text()
sms_df = pd.read_csv(sms_spam_input_path, sep='\t', header=None)
sms_df.columns = ["label", "text"]
sms_df['source'] = 'sms-spam'

In [81]:
sms_df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [82]:
my_dfs = []
for path in YOUTUBE_SPAM_DIR.glob("*.csv"):
    df = pd.read_csv(path)
    df.rename(columns={"CLASS":"raw_label", "CONTENT":"text"}, inplace=True)
    df['label'] = df['raw_label'].apply(lambda x: "spam" if str(x) == "1" else "ham")
    df['raw_source'] = str(path.name)
    df['source'] = 'youtube-spam'
    my_dfs.append(df)


In [83]:
yt_df = pd.concat(my_dfs)

In [84]:
yt_df

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,text,raw_label,label,raw_source,source
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1,spam,Youtube01-Psy.csv,youtube-spam
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1,spam,Youtube01-Psy.csv,youtube-spam
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1,spam,Youtube01-Psy.csv,youtube-spam
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,spam,Youtube01-Psy.csv,youtube-spam
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1,spam,Youtube01-Psy.csv,youtube-spam
...,...,...,...,...,...,...,...,...
365,_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0,ham,Youtube05-Shakira.csv,youtube-spam
366,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0,ham,Youtube05-Shakira.csv,youtube-spam
367,_2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs,jeffrey jules,2013-07-13T12:09:31.188000,wow,0,ham,Youtube05-Shakira.csv,youtube-spam
368,_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0,Aishlin Maciel,2013-07-13T11:17:52.308000,Shakira u are so wiredo,0,ham,Youtube05-Shakira.csv,youtube-spam


In [91]:
df = pd.concat([sms_df, yt_df[['label', 'text', 'source']]])

In [92]:
df

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam
...,...,...,...
365,ham,I love this song because we sing it at Camp al...,youtube-spam
366,ham,I love this song for two reasons: 1.it is abou...,youtube-spam
367,ham,wow,youtube-spam
368,ham,Shakira u are so wiredo,youtube-spam


In [95]:
 df.to_csv(SPAM_DATASET_PATH, index=False)