In [16]:
import dask.dataframe as dd
import dask.bag as db
import dask.array as da
from os import path
import pandas as pd
import csv, sys
import multiprocessing
from nltk import word_tokenize
sys.path.append('../pyspark')
from utility import *

filename_train = "../dataset/train.csv"
filename_test = "../dataset/valid.csv"
NUMBER_OF_CPU = multiprocessing.cpu_count()

# Load data in Dask Dataframe

In [17]:
@metrics
def load_data(trainFile, testFile):
    panda_train = pd.read_csv(trainFile)
    panda_test = pd.read_csv(testFile)
    train_df = dd.from_pandas(panda_train, npartitions=NUMBER_OF_CPU)
    test_df = dd.from_pandas(panda_test, npartitions=NUMBER_OF_CPU)
    return train_df, test_df

In [36]:
train_df, test_df = load_data(filename_train, filename_test)

--------- BEFORE CALL TO FUNCTION ---------



CPU in use: 59.00%
Time on CPU: 3:18:30.120000
Memory in use: 3.50GiB
Disk in use: 6.10%
Disk free: 795.66GiB


 --------- AFTER CALL TO FUNCTION ---------



CPU in use: 13.60%
Time on CPU: 3:18:32.450000
Memory in use: 3.43GiB
Disk in use: 6.10%
Disk free: 795.66GiB

----------> Execution Time: 0.93966 seconds


Below, we can see that the dataframe is succesfully imported into a Dask dataframe. Now, we need to extract the information we need & build a trainig & testing Dataframe that we will use for the later stages. 

**We are setting the number of partitions relative to the number of CPUs available in your machine.**

In [None]:
print(f'Number of partitions: {NUMBER_OF_CPU}')
test_df.head()

In [18]:
def get_question(partition):
    title = partition.Title
    body = partition.Body
    return title + " " + body

def get_quality(partition):
    return partition.Y

@metrics
def clean_data(train, test):
    train["X_trn"] = train.map_partitions(get_question, meta=str)
    train["y_trn"] = train.map_partitions(get_quality, meta=str)
    test["X_tst"] = test.map_partitions(get_question, meta=str)
    test["y_tst"] = test.map_partitions(get_quality, meta=str)
    new_train = train.drop(['Id', 'Title', 'Body', 'CreationDate', 'Y', 'Tags'], axis=1)
    new_test = test.drop(['Id', 'Title', 'Body', 'CreationDate', 'Y', 'Tags'], axis=1)
    return new_train, new_test

In [43]:
training, testing = clean_data(train_df, test_df)

--------- BEFORE CALL TO FUNCTION ---------



CPU in use: 35.50%
Time on CPU: 3:27:38.560000
Memory in use: 3.44GiB
Disk in use: 6.10%
Disk free: 795.66GiB


 --------- AFTER CALL TO FUNCTION ---------



CPU in use: 25.80%
Time on CPU: 3:27:38.850000
Memory in use: 3.44GiB
Disk in use: 6.10%
Disk free: 795.66GiB

----------> Execution Time: 0.04751 seconds


Below, you can see our training Dask Dataframe that we created by concatenating the Title & Body and their corresponding output. 

In [41]:
print(testing.head())
print(f'\n{type(testing)}')
print(testing.dtypes)

                                               X_tst    y_tst
0  How to get all the child records from differen...  LQ_EDIT
1  Retrieve all except some data of the another t...  LQ_EDIT
2  Pandas: read_html <p>I'm trying to extract US ...       HQ
3  Reader Always gimme NULL I'm so new to C#, I w...  LQ_EDIT
4  php rearrange array elements based on conditio...  LQ_EDIT

<class 'dask.dataframe.core.DataFrame'>
X_tst    object
y_tst    object
dtype: object


# Preprocess Data
In this part, we will preprocess our data by cleaning the text. Then, we will build a bag of word model.

### Cleaning data
Just like in Spark, we clean our data by going through the following steps:

- Lowercase questions
- Tokenize each question
- Remove all stopwords

In [19]:
STOPWORDS = [] 
with open("../dataset/stop_words.txt", "r") as r:
    STOPWORDS = r.read().split('\n')

@metrics
def preprocess_data(training, testing):
    if isinstance(training.head().loc[0, 'X_trn'], str):
        training["X_trn"] = training["X_trn"].str.lower()
        training["X_trn"] = training["X_trn"].replace(to_replace="(\\W)+", value=' ', regex=True)
        training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x.split(" ")], meta=str)
        training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x if token not in STOPWORDS], meta=str)
        training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x if token], meta=str)
        training['X_trn'] = training['X_trn'].apply(lambda x: " ".join(x), meta=str)
        
    if isinstance(testing.head().loc[0, 'X_tst'], str):
        testing["X_tst"] = testing["X_tst"].str.lower()
        testing["X_tst"] = testing["X_tst"].replace(to_replace="(\\W)+", value=' ', regex=True)
        testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x.split(" ")], meta=str)
        testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x if token not in STOPWORDS], meta=str)
        testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x if token], meta=str)
        testing['X_tst'] = testing['X_tst'].apply(lambda x: " ".join(x), meta=str)

In [52]:
preprocess_data(training, testing)

--------- BEFORE CALL TO FUNCTION ---------



CPU in use: 25.90%
Time on CPU: 3:31:19.060000
Memory in use: 3.43GiB
Disk in use: 6.10%
Disk free: 795.66GiB


 --------- AFTER CALL TO FUNCTION ---------



CPU in use: 0.00%
Time on CPU: 3:31:26.760000
Memory in use: 3.44GiB
Disk in use: 6.10%
Disk free: 795.66GiB

----------> Execution Time: 6.60484 seconds


In [53]:
training.head()

Unnamed: 0,X_trn,y_trn
0,java repeat task every random seconds p alread...,LQ_CLOSE
1,java optionals immutable p like understand jav...,HQ
2,text overlay image darkened opacity react nati...,HQ
3,ternary operator swift picky p question simple...,HQ
4,hide show fab scale animation p using custom f...,HQ


### Create BoW model
In this part, we want to create a bag of word model. The X will be a DataFrame where each column represents a word, each row represents a question & the number of times the word occur in the question.

In [20]:
from dask_ml.feature_extraction.text import CountVectorizer
from dask_ml import preprocessing

def compute_chunks(X_train, y_train, X_test, y_test):
    X_train.compute_chunk_sizes()
    y_train.compute_chunk_sizes()
    X_test.compute_chunk_sizes()
    y_test.compute_chunk_sizes()
    
def convert_X_data(train, test):
    X_train = train.map_blocks(lambda x: x.toarray(), dtype=int)
    X_test = test.map_blocks(lambda x: x.toarray(), dtype=int)
    return X_train, X_test

@metrics
def build_bow_model(training, testing):
    vectorizer = CountVectorizer()
    encoder = preprocessing.LabelEncoder()
    
    print("Converting to Dask Databags...")
    X_train_db = db.from_sequence(training['X_trn'], npartitions=NUMBER_OF_CPU)
    X_test_db = db.from_sequence(testing['X_tst'], npartitions=NUMBER_OF_CPU)

    print("Building BoW...")
    X_model = vectorizer.fit(X_train_db)
    X_train = X_model.transform(X_train_db)
    X_test = X_model.transform(X_test_db)

    print("Indexing strings...")
    y_model = encoder.fit(training['y_trn'])
    y_train = y_model.transform(training['y_trn'])
    y_test = y_model.transform(testing['y_tst'])
    
    print("Computing chunks...")
    compute_chunks(X_train, y_train, X_test, y_test)
    
    print("Re-convert to Dask Array")
    Xtrain, Xtest = convert_X_data(X_train, X_test)
        
    return Xtrain, y_train, Xtest, y_test

In [72]:
vectorizer, encoder = init_bow(training, testing)
X_model, y_model, X_train_db, X_test_db = build_bow(vectorizer, encoder)
X_train, y_train, X_test, y_test = transform_bow(X_model, y_model, X_train_db, X_test_db, training, testing)

--------- BEFORE CALL TO FUNCTION ---------



CPU in use: 20.80%
Time on CPU: 4:09:25.560000
Memory in use: 2.17GiB
Disk in use: 6.10%
Disk free: 795.65GiB
Converting to Dask Databags...
Building BoW...
Indexing strings...
Computing chunks...


 --------- AFTER CALL TO FUNCTION ---------



CPU in use: 0.00%
Time on CPU: 4:21:54.900000
Memory in use: 2.57GiB
Disk in use: 6.10%
Disk free: 795.65GiB

----------> Execution Time: 703.77465 seconds


# Train model
Let's train our model using our training set!

In [21]:
from dask_ml.wrappers import ParallelPostFit
from dask_ml.naive_bayes import GaussianNB

@metrics
def train_model(x_train, y_train):
    clf = ParallelPostFit(estimator = GaussianNB(), scoring='accuracy')
    clf.fit(x_train, y_train)
    return clf

In [1]:
clf = train_model(X_train, y_train)

NameError: name 'train_model' is not defined

In [77]:
predictions = clf.predict(xtest)

# Run entire process

In [23]:
print("###############LOADING DATA###############")
train_df, test_df = load_data(filename_train, filename_test)

print("###############CLEANING DATA###############")
training, testing = clean_data(train_df, test_df)

print("###############PREPROCESSING DATA###############")
preprocess_data(training, testing)

print("###############BUILDING BOW###############")
X_train, y_train, Xtest, y_test = build_bow_model(training, testing)

print("###############TRAINING DATA###############")
clf = train_model(X_train, y_train)

###############LOADING DATA###############
--------- BEFORE CALL TO FUNCTION ---------



CPU in use: 43.10%
Time on CPU: 0:47:22.450000
Memory in use: 3.29GiB
Disk in use: 6.10%
Disk free: 795.00GiB


 --------- AFTER CALL TO FUNCTION ---------



CPU in use: 47.50%
Time on CPU: 0:47:24.630000
Memory in use: 3.28GiB
Disk in use: 6.10%
Disk free: 795.00GiB

----------> Execution Time: 1.00071 seconds
###############CLEANING DATA###############
--------- BEFORE CALL TO FUNCTION ---------



CPU in use: 48.30%
Time on CPU: 0:47:24.930000
Memory in use: 3.29GiB
Disk in use: 6.10%
Disk free: 795.00GiB


 --------- AFTER CALL TO FUNCTION ---------



CPU in use: 14.80%
Time on CPU: 0:47:25.130000
Memory in use: 3.29GiB
Disk in use: 6.10%
Disk free: 795.00GiB

----------> Execution Time: 0.03078 seconds
###############PREPROCESSING DATA###############
--------- BEFORE CALL TO FUNCTION ---------



CPU in use: 28.80%
Time on CPU: 0:47:25.300000
Memory in use: 3.29GiB
Disk in use: 6.10%
Disk f

KeyboardInterrupt: 

In [15]:
process_info()

[]