In [97]:
import dask.dataframe as dd
import dask.bag as db
from os import path
import pandas as pd
import csv
import multiprocessing
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from dask_ml.model_selection import train_test_split
# from sklearn.pipeline import make_pipeline

filename_train = "../dataset/train.csv"
filename_test = "../dataset/valid.csv"
NUMBER_OF_CPU = multiprocessing.cpu_count()

# Load data in Dask Dataframe

In [3]:
panda_train = pd.read_csv(filename_train)[0:20]
panda_test = pd.read_csv(filename_test)

train_df = dd.from_pandas(panda_train, npartitions=1)
test_tf = dd.from_pandas(panda_test, npartitions=NUMBER_OF_CPU)

Below, we can see that the dataframe is succesfully imported into a Dask dataframe. Now, we need to extract the information we need & build a trainig & testing Dataframe that we will use for the later stages. 

**We are setting the number of partitions relative to the number of CPUs available in your machine.**

In [6]:
print(f'Number of partitions: {NUMBER_OF_CPU}')
train_df.head()

Number of partitions: 6


Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


In [72]:
def get_question(partition):
    title = partition.Title
    body = partition.Body
    return title + " " + body

def get_quality(partition):
    return partition.Y

## Initialize training & testing dask dataframe
train_df["X_trn"] = train_df.map_partitions(get_question, meta=str)
train_df["y_trn"] = train_df.map_partitions(get_quality, meta=str)
test_tf["X_trn"] = test_tf.map_partitions(get_question, meta=str)
test_tf["y_trn"] = test_tf.map_partitions(get_quality, meta=str)

training = train_df.drop(['Id', 'Title', 'Body', 'CreationDate', 'Y', 'Tags'], axis=1)
testing = test_tf.drop(['Id', 'Title', 'Body', 'CreationDate', 'Y', 'Tags'], axis=1)

Below, you can see our training Dask Dataframe that we created by concatenating the Title & Body and their corresponding output. 

In [8]:
print(training.head())
print(f'\n{type(training)}')
print(training.dtypes)

                                               X_trn     y_trn
0  Java: Repeat Task Every Random Seconds <p>I'm ...  LQ_CLOSE
1  Why are Java Optionals immutable? <p>I'd like ...        HQ
2  Text Overlay Image with Darkened Opacity React...        HQ
3  Why ternary operator in swift is so picky? <p>...        HQ
4  hide/show fab with scale animation <p>I'm usin...        HQ

<class 'dask.dataframe.core.DataFrame'>
X_trn    object
y_trn    object
dtype: object


# Preprocess Data
In this part, we will preprocess our data by cleaning the text. Then, we will build a bag of word model.

### Cleaning data

In [73]:
STOPWORDS = ["why", "the", "i"] 

if isinstance(training.head().loc[1, 'X_trn'], str):
    training["X_trn"] = training["X_trn"].str.lower()
    training["X_trn"] = training["X_trn"].replace(to_replace="(\\W)+", value=' ', regex=True)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x.split(" ")], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x if token not in STOPWORDS], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: [token for token in x if token], meta=str)
    training['X_trn'] = training['X_trn'].apply(lambda x: " ".join(x), meta=str)

In [74]:
print(training.head().loc[0, 'X_trn'])
# training.head()

java repeat task every random seconds p m already familiar with repeating tasks every n seconds by using java util timer and java util timertask but lets say want to print hello world to console every random seconds from 1 5 unfortunately m in a bit of a rush and don t have any code to show so far any help would be apriciated p


### Create BoW model

In [100]:
# Build BoW model
vectorizer = CountVectorizer()
encoder = preprocessing.LabelEncoder()

X_trn_countMatrix = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(training['X_trn']))
y_trn_encoder = encoder.fit_transform(training['y_trn'])

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X_trn_countMatrix, y_trn_encoder,
                                                    random_state=0, convert_mixed_types=True)