### Libraries and data ingestion

In [1]:
from common_libraries import * 
import project_function


In [2]:

# Settings applied to the session builder, in the order they are registered.
# NOTE(review): with master "local[*]" the executor-* and dynamicAllocation
# settings may have no effect — confirm against the Spark configuration docs.
spark_settings = {
    "spark.sql.debug.catalog": False,
    "spark.logLevel": "ERROR",
    "spark.dynamicAllocation.enabled": "true",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    "spark.driver.memory": "4g",
    "spark.driver.cores": "2",
}

# Build (or reuse) a local session that uses every available core.
session_builder = SparkSession.builder.master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Only surface ERROR-level messages from the Spark context in notebook output.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/20 19:58:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Data Preprocessing

In [3]:
# Load the curated dataset from the Parquet file `curated_data.parquet`.
curated_df = spark.read.parquet("curated_data.parquet")

                                                                                

#### Binary Classification: Grouping related crime categories into broader categories (e.g., robbery crimes vs. non-robbery crimes) and training a separate binary classifier for each group.

In [4]:
# Binary target: 1 when the crime description is exactly 'ROBBERY', 0 for
# everything else (including rows where the comparison evaluates to null).
is_robbery = col('crm_cd_desc') == 'ROBBERY'
curated_df = curated_df.withColumn(
    'robbery_crime_type',
    when(is_robbery, 1).otherwise(0),
)

In [5]:
# Drop the description column the label was built from, together with crm_cd
# (presumably the matching crime code — confirm), so neither stays in the frame.
curated_df = curated_df.drop('crm_cd_desc','crm_cd')

In [6]:
#curated_df.select(col('crm_cd_desc')).distinct().show(40,truncate=False)
#curated_df.groupBy('crm_cd_desc').count().orderBy(col('count').desc()).show(truncate=False)

#### Assessing the balance of the class instances (one class is significantly more prevalent than the other)

In [7]:
#curated_df.toPandas().robbery_crime_type.value_counts(normalize=True)

In [8]:
# Total number of rows, used as the denominator for the class proportions.
total_rows = curated_df.count()

# Row count per label value (0 = non-robbery, 1 = robbery).
robbery_type_counts = curated_df.groupBy('robbery_crime_type').count()

# Share of the dataset that each label value represents.
proportions = robbery_type_counts.withColumn(
    'proportion',
    F.col('count') / total_rows,
)

proportions.show()

+------------------+------+--------------------+
|robbery_crime_type| count|          proportion|
+------------------+------+--------------------+
|                 1| 31521|0.034050252776217434|
|                 0|894199|  0.9659497472237826|
+------------------+------+--------------------+



#### Assessing categorical feature vict_sex

In [9]:

# List the distinct values present in vict_sex before it is encoded.
vict_sex_levels = curated_df.select('vict_sex').distinct()
vict_sex_levels.show()
# Scratch snippets kept from exploration (most frequent category lookups):
#count().orderBy('count', ascending=False).first()['vict_sex']
#df.groupBy('vict_descent').count().orderBy('count', ascending=False).first()['vict_descent']


+--------+
|vict_sex|
+--------+
|       F|
| unknown|
|       M|
|       X|
|       H|
+--------+



#### Convert categorical features with one-hot encoding 

In [10]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Stage 1: map each vict_sex string to a numeric category index.
stringIndexer = StringIndexer(
    inputCol="vict_sex",
    outputCol="vict_sex_type_indexed",
)

# Stage 2: expand the category index into a one-hot encoded vector.
encoder = OneHotEncoder(
    inputCol="vict_sex_type_indexed",
    outputCol="vict_sex_type_encoded",
)

# Chain both stages so they are fitted and applied together.
pipeline = Pipeline(stages=[stringIndexer, encoder])

# Learn the category mapping from the data, then append the two new columns.
encoding_model = pipeline.fit(curated_df)
curated_df_encoded = encoding_model.transform(curated_df)



In [11]:
# Remove the raw string column; its indexed and one-hot encoded versions remain.
curated_df_encoded = curated_df_encoded.drop('vict_sex')


#### Subsetting datasets with features of interest.

In [12]:
# Columns carried into modelling: victim age, occurrence/report date parts,
# the binary label, and the indexed victim sex.
# NOTE(review): the one-hot column vict_sex_type_encoded is not selected here;
# only the category index is carried forward.
model_columns = [
    'vict_age',
    'crime_day_occ',
    'crime_month_occ',
    'crime_year_occ',
    'crime_day_rptd',
    'crime_month_rptd',
    'crime_year_rptd',
    'robbery_crime_type',
    'vict_sex_type_indexed',
]
data_model = curated_df_encoded.select(*model_columns)

In [13]:
#categorical_features = [curated_df.dtypes[value][0] for value in range(0,len(curated_df.columns)) if curated_df.dtypes[value][1]=='string']

In [14]:
#numerical_features = [curated_df.dtypes[value][0] for value in range(0,len(curated_df.columns)) if curated_df.dtypes[value][1]=='int' or curated_df.dtypes[value][1]=='double']


### Balance Class Distribution using SMOTE.

#### SMOTE (Synthetic Minority Over-sampling Technique) is a technique used to balance class distributions by generating synthetic samples of the minority class. It works by creating new instances that are similar to existing minority class instances. This helps address imbalances in the dataset and improves the performance of machine learning models, especially those sensitive to class imbalance.

In [15]:
# Convert to Pandas DataFrame
train_data_pd = data_model.toPandas()

# Apply oversampling 
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(train_data_pd.drop('robbery_crime_type', axis=1), train_data_pd['robbery_crime_type'])

# Combine resampled features and target variable
resampled_df = pd.DataFrame(X_resampled, columns=train_data_pd.drop('robbery_crime_type', axis=1).columns)
resampled_df['robbery_crime_type'] = y_resampled

# Convert back to PySpark DataFrame
train_data_balanced = spark.createDataFrame(resampled_df).repartition(8)


                                                                                

#### Creating a Vector Assembler which merges multiple columns into a single vector column. It's commonly used to assemble feature vectors for machine learning models in PySpark.

In [16]:
# Define the feature vector assembler
assembler = VectorAssembler(inputCols=['vict_age', 'crime_day_occ', 'crime_month_occ', 'crime_year_occ', 
                     
                                       'crime_day_rptd', 'crime_month_rptd', 'crime_year_rptd', 
                                       'vict_sex_type_indexed'], outputCol='features')

# Assemble features
data_assembled = assembler.transform(train_data_balanced)

# Split the data into training and testing sets
train_data, test_data = data_assembled.randomSplit([0.8, 0.2], seed=42)

# Scale features
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')



#### Now that the data preprocessing is complete, we fit a logistic regression model to classify crimes as either robbery or non-robbery based on various features, evaluate its performance, and print the AUC-ROC of the model's predictions.

In [17]:
# Column holding the binary robbery / non-robbery target.
target_column = 'robbery_crime_type'

# Logistic regression trained on the standardised feature vector.
lr = LogisticRegression(
    featuresCol='scaled_features',
    labelCol=target_column,
)

# Scaling and classification are fitted together as one pipeline.
pipeline = Pipeline(stages=[scaler, lr])
model = pipeline.fit(train_data)

# Score the held-out rows.
predictions = model.transform(test_data)

# Evaluate with the evaluator's default metric and report it as AUC-ROC.
evaluator = BinaryClassificationEvaluator(labelCol=target_column)
auc_roc = evaluator.evaluate(predictions)

print("AUC-ROC:", auc_roc)

                                                                                

AUC-ROC: 0.5526471460967626


In [18]:

def _to_prediction_label(row):
    """Return a (prediction, label) pair of floats for one prediction row."""
    return float(row['prediction']), float(row['robbery_crime_type'])


# MulticlassMetrics is fed an RDD of (prediction, label) float pairs.
scored_rows = predictions.select('prediction', 'robbery_crime_type').rdd
predictionAndLabels = scored_rows.map(_to_prediction_label)

metrics = MulticlassMetrics(predictionAndLabels)

# Confusion matrix as an array.
confusionMatrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:\n", confusionMatrix)

# Precision, recall and F-measure for label 1.0 (robbery).
precision = metrics.precision(1.0)
recall = metrics.recall(1.0)
f1Score = metrics.fMeasure(1.0)

for metric_label, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1Score),
):
    print(metric_label, metric_value)


[Stage 113:>                                                        (0 + 8) / 8]

Confusion Matrix:
 [[ 83613.  95252.]
 [ 68628. 110114.]]
Precision: 0.5361841784910842
Recall: 0.6160499490886305
F1 Score: 0.5733491622147937


                                                                                

#### Let's interpret each of these metrics:

### AUC-ROC: 0.5526471460967626
- **AUC-ROC (Area Under the Receiver Operating Characteristic curve)** measures the ability of a classifier to distinguish between classes and is used as a summary of the model's performance. The ROC curve is a graphical representation of the trade-off between the true positive rate (TPR, recall) and the false positive rate (FPR) across different thresholds.
- An AUC-ROC value of 0.5526 is slightly better than random guessing, which has an AUC-ROC of 0.5. However, it's still considered not very good, indicating that the model does not do a great job at distinguishing between the positive and negative classes. Typically, an AUC-ROC closer to 1.0 indicates excellent model performance, while closer to 0.5 suggests no discriminative ability.

### Confusion Matrix
- The confusion matrix is a 2x2 table that shows the counts of correct and incorrect predictions classified by the actual classes:
  ```
  [ True Negatives  (TN) | False Positives (FP) ]
  [ False Negatives (FN) | True Positives  (TP) ]
  ```
- In the confusion matrix:
  - **True Negatives (TN)**: 83,613 - The number of negative instances correctly classified as negative.
  - **False Positives (FP)**: 95,252 - The number of negative instances incorrectly classified as positive.
  - **False Negatives (FN)**: 68,628 - The number of positive instances incorrectly classified as negative.
  - **True Positives (TP)**: 110,114 - The number of positive instances correctly classified as positive.

### Precision: 0.5361841784910842
- Precision measures the accuracy of positive predictions. It is defined as the ratio of true positives to the sum of true and false positives.
- A precision of 0.5362 indicates that approximately 53.62% of the model’s positive classifications are correct, suggesting that when the model predicts an instance as positive, it is correct about half the time.

### Recall: 0.6160499490886305
- Recall (or Sensitivity or True Positive Rate) measures the ability of a model to find all the relevant cases (all true positives).
- A recall of 0.6160 means that the model correctly identifies about 61.60% of the actual positive cases. Thus, the model is moderately effective at capturing positive instances but still misses around 38.40% of them.

### F1 Score: 0.5733491622147937
- F1 Score is the harmonic mean of precision and recall. It is a balance between precision and recall, providing a single score that balances both the false positives and false negatives.
- An F1 score of 0.5733 suggests a moderate balance between precision and recall, which is not particularly high, indicating that the model is not very effective in terms of precision-recall trade-off.

### Overall Interpretation
- The model shows limited effectiveness in discriminating between the positive and negative classes as indicated by the AUC-ROC score.
- Although it has a moderate recall, its precision is also moderate, leading to a moderate F1 score. This could be indicative of a need to revisit feature selection, model choice, or threshold settings.
- The high number of false positives and false negatives suggests potential issues with the model’s ability to generalize or possibly imbalanced class distribution in the data. Consider exploring model improvements or trying different classification algorithms.
- Consider additional data preprocessing, feature engineering, or different classification threshold settings to improve these metrics.