In [1]:
import dask.dataframe as dd
import dask.bag as db
from os import path
import pandas as pd
import csv
import multiprocessing
from nltk import word_tokenize

filename_train = "../dataset/train.csv"
filename_test = "../dataset/valid.csv"
NUMBER_OF_CPU = multiprocessing.cpu_count()

# Load data in Dask Dataframe

In [10]:
panda_train = pd.read_csv(filename_train)
panda_test = pd.read_csv(filename_test)

train_df = dd.from_pandas(panda_train, npartitions=1)
test_df = dd.from_pandas(panda_test, npartitions=1)

Below, we can see that the dataframe is succesfully imported into a Dask dataframe. Now, we need to extract the information we need & build a trainig & testing Dataframe that we will use for the later stages. 

**We are setting the number of partitions relative to the number of CPUs available in your machine.**

In [162]:
print(f'Number of partitions: {NUMBER_OF_CPU}')
test_df.head()

Number of partitions: 6


Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552974,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,<sql><sql-server>,2016-01-01 01:44:52,LQ_EDIT
1,34554721,Retrieve all except some data of the another t...,I have two table m_master and tbl_appointment\...,<php><mysql><sql><codeigniter><mysqli>,2016-01-01 08:43:50,LQ_EDIT
2,34555135,Pandas: read_html,<p>I'm trying to extract US states from wiki U...,<python><pandas>,2016-01-01 09:55:22,HQ
3,34555448,Reader Always gimme NULL,"I'm so new to C#, I wanna make an application ...",<sql-server><c#-4.0>,2016-01-01 10:43:45,LQ_EDIT
4,34555752,php rearrange array elements based on condition,basically i have this array:\r\n\r\n array(...,<php>,2016-01-01 11:34:09,LQ_EDIT


In [11]:
def get_question(partition):
    title = partition.Title
    body = partition.Body
    return title + " " + body

def get_quality(partition):
    return partition.Y

## Initialize training & testing dask dataframe
train_df["X_trn"] = train_df.map_partitions(get_question, meta=str)
train_df["y_trn"] = train_df.map_partitions(get_quality, meta=str)
test_df["X_tst"] = test_df.map_partitions(get_question, meta=str)
test_df["y_tst"] = test_df.map_partitions(get_quality, meta=str)

training = train_df.drop(['Id', 'Title', 'Body', 'CreationDate', 'Y', 'Tags'], axis=1)
testing = test_df.drop(['Id', 'Title', 'Body', 'CreationDate', 'Y', 'Tags'], axis=1)

Below, you can see our training Dask Dataframe that we created by concatenating the Title & Body and their corresponding output. 

In [None]:
print(testing.head())
print(f'\n{type(testing)}')
print(testing.dtypes)

# Preprocess Data
In this part, we will preprocess our data by cleaning the text. Then, we will build a bag of word model.

### Cleaning data
Just like in Spark, we clean our data by going through the following steps:

- Lowercase questions
- Tokenize each question
- Remove all stopwords

In [12]:
STOPWORDS = [] 
with open("../dataset/stop_words.txt", "r") as r:
    STOPWORDS = r.read().split('\n')

if isinstance(training.head().loc[0, 'X_trn'], str):
    training["X_trn"] = training["X_trn"].str.lower()
    training["X_trn"] = training["X_trn"].replace(to_replace="(\\W)+", value=' ', regex=True)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x.split(" ")], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x if token not in STOPWORDS], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x if token], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: " ".join(x), meta=str)
    
if isinstance(testing.head().loc[0, 'X_tst'], str):
    testing["X_tst"] = testing["X_tst"].str.lower()
    testing["X_tst"] = testing["X_tst"].replace(to_replace="(\\W)+", value=' ', regex=True)
    testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x.split(" ")], meta=str)
    testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x if token not in STOPWORDS], meta=str)
    testing['X_tst'] = testing['X_tst'].apply(lambda x: [token for token in x if token], meta=str)
    testing['X_tst'] = testing['X_tst'].apply(lambda x: " ".join(x), meta=str)

In [174]:
print(training.head().loc[0, 'X_trn'])
testing.head()

java repeat task every random seconds p already familiar repeating tasks every n seconds using java util timer java util timertask lets say want print hello world console every random seconds 1 5 unfortunately bit rush code show far help would apriciated p


Unnamed: 0,X_tst,y_tst
0,get child records different tables based given...,LQ_EDIT
1,retrieve except data another table two table m...,LQ_EDIT
2,pandas read_html p trying extract us states wi...,HQ
3,reader always gimme null new c wanna make appl...,LQ_EDIT
4,php rearrange array elements based condition b...,LQ_EDIT


### Create BoW model
In this part, we want to create a bag of word model. The X will be a DataFrame where each column represents a word, each row represents a question & the number of times the word occur in the question.

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

vectorizer = CountVectorizer()
encoder = preprocessing.LabelEncoder()

X_train = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(training['X_trn']))
y_train = encoder.fit_transform(training['y_trn'])
X_test = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(testing['X_tst']))
y_test = encoder.transform(testing['y_tst'])

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(45000, 202529)
(45000,)
(15000, 202529)
(15000,)


# Train model
Let's train our model using our training set!

In [14]:
from dask_ml.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train)
clf.predict(X_test)

AssertionError: length mismatch: 15000 vs. 202529

In [175]:
X_test
# clf.predict(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,202519,202520,202521,202522,202523,202524,202525,202526,202527,202528
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
