In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the Spark session already attached to this kernel, or start one.
spark = (
    SparkSession.builder
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [3]:
# Load the Kickstarter dataset from its parquet file (path is relative to the
# notebook's working directory).
df_kids = (
    spark.read
    .parquet('../data/kickstarter.parquet')
)
# count() triggers a full read; it is used here as a quick sanity check.
print(f'Number of lines: {df_kids.count()}')

Number of lines: 107615


In [4]:
# Print column names, types and nullability of the loaded DataFrame.
df_kids.printSchema()

root
 |-- project_id: string (nullable = true)
 |-- goal: double (nullable = true)
 |-- final_status: integer (nullable = true)
 |-- country_clean: string (nullable = true)
 |-- currency_clean: string (nullable = true)
 |-- deadline_clean: date (nullable = true)
 |-- created_at_clean: date (nullable = true)
 |-- launched_at_clean: date (nullable = true)
 |-- days_campaign: integer (nullable = true)
 |-- hours_prepa: double (nullable = true)
 |-- text: string (nullable = true)



In [5]:
# Preview the free-text column; show() prints the first 20 rows, truncating long values.
df_kids.select('text').show()

+--------------------+
|                text|
+--------------------+
|where demons danc...|
|i want to bring l...|
|whole, an art exh...|
|make music and me...|
|once you know...a...|
|modular desktop c...|
|rolling freight b...|
|"theodore wants t...|
|afterglow radio w...|
|the best of betty...|
|"""""""""""""""""...|
|rockers with love...|
|help support - je...|
|prepared televisi...|
|there it was ther...|
|building the open...|
|anima - the film....|
|"flatbed honeymoo...|
|stuffer the story...|
|participate in a ...|
+--------------------+
only showing top 20 rows



In [6]:
# Target column and the explanatory columns used to predict it.
label = 'final_status'
features = [
    'days_campaign',
    'hours_prepa',
    'country_clean',
    'currency_clean',
]

# Keep only the modelling columns (features first, label last) and preview them.
df = df_kids.select(*features, label)
df.show()

+-------------+-----------+-------------+--------------+------------+
|days_campaign|hours_prepa|country_clean|currency_clean|final_status|
+-------------+-----------+-------------+--------------+------------+
|           90|        1.1|           US|           USD|           0|
|           29|      41.62|           US|           USD|           1|
|           28|       18.7|           US|           USD|           0|
|           47|      75.69|           US|           USD|           0|
|           45|     457.74|           US|           USD|           0|
|           45|     380.94|           US|           USD|           1|
|           46|     814.58|           US|           USD|           1|
|           61|      58.93|           US|           USD|           0|
|           40|       2.16|           US|           USD|           1|
|           44|     191.41|           US|           USD|           0|
|           30|      43.21|           US|           USD|           0|
|           60|     

In [7]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [8]:
def build_string_indexer(col_name: str) -> StringIndexer:
    """Create an unfitted StringIndexer for one column.

    Args:
        col_name: Name of the input column to index.

    Returns:
        A StringIndexer reading ``col_name`` and writing ``<col_name>_indexed``,
        configured with ``handleInvalid="keep"`` (per the Spark docs, values
        not seen during fit get an extra index instead of raising).
    """
    indexer = StringIndexer()
    indexer.setInputCol(col_name)
    indexer.setOutputCol(f'{col_name}_indexed')
    indexer.setHandleInvalid("keep")
    return indexer

def build_one_hot_encoder(col_name: str) -> OneHotEncoder:
    """Create a OneHotEncoder for one (already indexed) column.

    Args:
        col_name: Name of the input column holding category indices.

    Returns:
        A OneHotEncoder reading ``col_name`` and writing ``<col_name>_encoded``.
    """
    encoder = OneHotEncoder()
    encoder.setInputCol(col_name)
    encoder.setOutputCol(f'{col_name}_encoded')
    return encoder

categorical_columns = ['days_campaign', 'hours_prepa', 'country_clean', 'currency_clean']

indexing_stages = [build_string_indexer(c) for c in categorical_columns]
indexed_columns = [s.getOutputCol() for s in indexing_stages]
encoding_stages = [build_one_hot_encoder(c) for c in indexed_columns]

In [9]:
vector_assembler = VectorAssembler()\
    .setInputCols([s.getOutputCol() for s in encoding_stages])\
    .setOutputCol('features')

In [10]:
# Logistic regression reading the assembled feature vector and predicting
# the 'final_status' column; training is capped at 3 iterations.
lr = LogisticRegression()
lr.setMaxIter(3)
lr.setFeaturesCol(vector_assembler.getOutputCol())
lr.setLabelCol('final_status')

# Full pipeline: index -> one-hot encode -> assemble -> classify.
model_specs = Pipeline(
    stages=[*indexing_stages, *encoding_stages, vector_assembler, lr]
)

In [11]:
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import mlflow
from mlflow.spark import log_model

# Point mlflow at the tracking server and select the experiment to log into.
mlflow_tracking_ui = 'http://35.246.84.226'
mlflow_experiment_name = 'kickstarter'
mlflow.set_tracking_uri(mlflow_tracking_ui)
mlflow.set_experiment(experiment_name=mlflow_experiment_name)

# Area under the ROC curve, computed from the model's raw prediction column.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    rawPredictionCol='rawPrediction',
    labelCol='final_status',
)

# Seeded 80/20 split; the training set is cached because it is
# counted, fitted on and scored below.
df_train, df_test = df.randomSplit([0.8, 0.2], seed=12345)
df_train = df_train.cache()

with mlflow.start_run() as active_run:
    print(f'Fitting model on {df_train.count()} lines')
    model = model_specs.fit(df_train)

    print('Evaluating model')
    train_metrics = evaluator.evaluate(model.transform(df_train))
    test_metrics = evaluator.evaluate(model.transform(df_test))
    metrics = {'train_auc': train_metrics, 'test_auc': test_metrics}
    print(metrics)

    print('Logging to mlflow')
    mlflow.log_params({'model_class': 'logistic regression', 'lr_max_iter': lr.getMaxIter()})
    mlflow.log_metrics(metrics)
    # Persist the fitted pipeline as a run artifact named 'model'.
    log_model(model, 'model')

Fitting model on 86092 lines
Evaluating model
{'train_auc': 0.9658356041416117, 'test_auc': 0.5431559786851168}
Logging to mlflow
