In [105]:
import dask.dataframe as dd
import dask.bag as db
import dask.array as da
from os import path
import pandas as pd
import csv
import multiprocessing
from nltk import word_tokenize

filename_train = "../dataset/train.csv"
filename_test = "../dataset/valid.csv"
NUMBER_OF_CPU = multiprocessing.cpu_count()

# Load data in Dask Dataframe

In [106]:
panda_train = pd.read_csv(filename_train)[0:100]
panda_test = pd.read_csv(filename_test)[0:100]

train_df = dd.from_pandas(panda_train, npartitions=NUMBER_OF_CPU)
test_df = dd.from_pandas(panda_test, npartitions=NUMBER_OF_CPU)

Below, we can see that the dataframe is succesfully imported into a Dask dataframe. Now, we need to extract the information we need & build a trainig & testing Dataframe that we will use for the later stages. 

**We are setting the number of partitions relative to the number of CPUs available in your machine.**

In [None]:
print(f'Number of partitions: {NUMBER_OF_CPU}')
test_df.head()

In [107]:
def get_question(partition):
    title = partition.Title
    body = partition.Body
    return title + " " + body

def get_quality(partition):
    return partition.Y

## Initialize training & testing dask dataframe
train_df["X_trn"] = train_df.map_partitions(get_question, meta=str)
train_df["y_trn"] = train_df.map_partitions(get_quality, meta=str)
test_df["X_tst"] = test_df.map_partitions(get_question, meta=str)
test_df["y_tst"] = test_df.map_partitions(get_quality, meta=str)

training = train_df.drop(['Id', 'Title', 'Body', 'CreationDate', 'Y', 'Tags'], axis=1)
testing = test_df.drop(['Id', 'Title', 'Body', 'CreationDate', 'Y', 'Tags'], axis=1)

Below, you can see our training Dask Dataframe that we created by concatenating the Title & Body and their corresponding output. 

In [None]:
print(testing.head())
print(f'\n{type(testing)}')
print(testing.dtypes)

# Preprocess Data
In this part, we will preprocess our data by cleaning the text. Then, we will build a bag of word model.

### Cleaning data
Just like in Spark, we clean our data by going through the following steps:

- Lowercase questions
- Tokenize each question
- Remove all stopwords

In [108]:
STOPWORDS = [] 
with open("../dataset/stop_words.txt", "r") as r:
    STOPWORDS = r.read().split('\n')

if isinstance(training.head().loc[0, 'X_trn'], str):
    training["X_trn"] = training["X_trn"].str.lower()
    training["X_trn"] = training["X_trn"].replace(to_replace="(\\W)+", value=' ', regex=True)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x.split(" ")], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x if token not in STOPWORDS], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x if token], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: " ".join(x), meta=str)
    
if isinstance(testing.head().loc[0, 'X_tst'], str):
    testing["X_tst"] = testing["X_tst"].str.lower()
    testing["X_tst"] = testing["X_tst"].replace(to_replace="(\\W)+", value=' ', regex=True)
    testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x.split(" ")], meta=str)
    testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x if token not in STOPWORDS], meta=str)
    testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x if token], meta=str)
    testing['X_tst'] = testing['X_tst'].apply(lambda x: " ".join(x), meta=str)

In [None]:
# print(training.head().loc[0, 'X_trn'])
training.head()

### Create BoW model
In this part, we want to create a bag of word model. The X will be a DataFrame where each column represents a word, each row represents a question & the number of times the word occur in the question.

In [126]:
from dask_ml.feature_extraction.text import CountVectorizer
from dask_ml import preprocessing

vectorizer = CountVectorizer()
encoder = preprocessing.LabelEncoder()

X_train_db = db.from_sequence(training['X_trn'], npartitions=NUMBER_OF_CPU)
X_test_db = db.from_sequence(testing['X_tst'], npartitions=NUMBER_OF_CPU)

X_model = vectorizer.fit(X_train_db)
X_train = X_model.transform(X_train_db)
X_test = X_model.transform(X_test_db)

y_model = encoder.fit(training['y_trn'])
y_train = y_model.transform(training['y_trn'])
y_test = y_model.transform(testing['y_tst'])

In [127]:
X_train.compute_chunk_sizes()
y_train.compute_chunk_sizes()
X_test.compute_chunk_sizes()
y_test.compute_chunk_sizes()

Unnamed: 0,Array,Chunk
Bytes,800 B,136 B
Shape,"(100,)","(17,)"
Count,168 Tasks,6 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 800 B 136 B Shape (100,) (17,) Count 168 Tasks 6 Chunks Type int64 numpy.ndarray",100  1,

Unnamed: 0,Array,Chunk
Bytes,800 B,136 B
Shape,"(100,)","(17,)"
Count,168 Tasks,6 Chunks
Type,int64,numpy.ndarray


In [130]:
%%time
# np_x = dd.from_dask_array(X_train)
# np_y = dd.from_dask_array(y_train)

# np_test_x = dd.from_dask_array(X_test)
print(X_train)
print(y_train)
print(X_test)
print(y_test)

dask.array<from-bag-_count_vectorizer_transform, shape=(100, 2451), dtype=int64, chunksize=(17, 2451), chunktype=scipy.csr_matrix>
dask.array<_check_and_search_block, shape=(100,), dtype=int64, chunksize=(17,), chunktype=numpy.ndarray>
dask.array<from-bag-_count_vectorizer_transform, shape=(100, 2451), dtype=int64, chunksize=(17, 2451), chunktype=scipy.csr_matrix>
dask.array<_check_and_search_block, shape=(100,), dtype=int64, chunksize=(17,), chunktype=numpy.ndarray>
CPU times: user 521 µs, sys: 0 ns, total: 521 µs
Wall time: 296 µs


# Train model
Let's train our model using our training set!

In [125]:
%%time
from dask_ml.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train)
# predictions = clf.predict(np_test_x)
# predictions.compute()

IndexError: Index dimension must be <= 2

In [45]:
clf.predict()

ValueError: Stacked arrays must have the same shape. The first array had shape (nan,), while array 2 has shape (nan,).

In [76]:
from dask_ml import datasets
a,b = datasets.make_classification(chunks=50)
jure = GaussianNB()
jure.fit(a, b)
a

Unnamed: 0,Array,Chunk
Bytes,16.00 kB,8.00 kB
Shape,"(100, 20)","(50, 20)"
Count,2 Tasks,2 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 16.00 kB 8.00 kB Shape (100, 20) (50, 20) Count 2 Tasks 2 Chunks Type float64 numpy.ndarray",20  100,

Unnamed: 0,Array,Chunk
Bytes,16.00 kB,8.00 kB
Shape,"(100, 20)","(50, 20)"
Count,2 Tasks,2 Chunks
Type,float64,numpy.ndarray
