In [1]:
import os
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from collections import Counter

In [2]:
os.getcwd()

'/Users/nszoni/Desktop/repos/Python-Programming-and-Text-Analysis/Assignments'

In [3]:
os.chdir('/Users/nszoni/Desktop/repos/Python-Programming-and-Text-Analysis/Inputs/spam-reduced')

In [4]:
#get stopwords
nltk.download('stopwords')
stopwords_corpus = set(stopwords.words('english'))

#define punctuation remover
def remove_punc(s):
    string_nopunc=""
    for char in s:
        if char not in string.punctuation:
            string_nopunc=string_nopunc + char
    return(string_nopunc.lower())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nszoni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def tokens_generator() -> pd.DataFrame:
    
    """
    
    1- removes punctuation and stopwords from each file.
    2- create a corpus, a list of words/tokens, containing all unique words that appear in all the .tex files.
    3- writes a file that is called `output.csv`. The file should contain the following variables:  
        i) a variable named id that records the name of each document,  
        ii) variables that record the number of times each token appears in each document and are named according to each token.
        
    """
    # init empty list of dataframes
    small_dfs = []
    
    # loop through files in directory
    for filename in os.listdir():
        
        # init list storing raw tokens
        tokenized=[]

        # init list storing raw tokens (no punc and stopwords)
        tokenized_clean=[]
        
        if filename.endswith(".txt"):

          # extract and clean tokens
          with open(filename, "r", encoding='utf-8') as file:
            for line in file:
                tokenized.extend(line.split())
            for element in tokenized:
                if remove_punc(element) not in stopwords_corpus:
                    tokenized_clean.append(remove_punc(element))
                else:
                    continue

            # convert to dict since counters can only store numeric values
            dict_counts = dict(Counter(tokenized_clean))

            # append id to file
            dict_counts.update({'id': filename.replace(".txt", "")})

            # move id to the beginning of dictionary
            reversed_dict = dict(reversed(list(dict_counts.items())))

            # append dataframe to empty list
            small_df = pd.DataFrame.from_dict([reversed_dict])
            small_dfs.append(small_df)

        else:
          print(f"Skipped {filename} because it is not a .txt file")
          continue
    
    # concat small dataframes generated by each file and convert values back to integer
    large_df = pd.concat(small_dfs, ignore_index=True).fillna(0)
    large_df.iloc[:,1:] = large_df.iloc[:,1:].astype(int)
    
    return large_df

In [14]:
large_df = tokens_generator()
large_df.head()

Skipped labels.csv because it is not a .txt file
Skipped .DS_Store because it is not a .txt file
Skipped output.csv because it is not a .txt file


Unnamed: 0,id,job,online,details,need,lor,u,2,tok,aiyah,...,tscs,83370,wk,every,cash,£250,chance,ring,change,dare
0,0ffaa21e,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,36967de7,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,6e6d1774,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,566b4a3e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,fd188623,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# write output to csv
large_df.to_csv("output.csv", index=False)

In [16]:
# read labels and output
labels = pd.read_csv('labels.csv')
output = pd.read_csv('output.csv')

# left join labels
full_df = pd.merge(output, labels, how = 'left', on='id')

In [17]:
full_df.head()

Unnamed: 0,id,job,online,details,need,lor,u,2,tok,aiyah,...,83370,wk,every,cash,£250,chance,ring,change,dare,label
0,0ffaa21e,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,36967de7,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,ham
2,6e6d1774,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
3,566b4a3e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,fd188623,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
