# MLP Project

## Data Loading

In [114]:
import numpy as np; import pandas as pd; pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt; import seaborn as sns

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

## EDA

## Preprocessing

In [133]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [116]:
def extract_location_data(s):
    street_number = direction = street_name = street_type = None
    try: # steet number
        nt = s.split(' ')[0]
        street_number = nt if int(nt.isdigit()) else None
    except: pass
    try: # direction
        nt = [x.strip() for x in s.split(' ') if x.strip()]
        for x in nt:
            if x.isdigit(): continue
            elif len(x)>1: break
            if x in ['N', 'S', 'E', 'W']:
                direction = x; break
    except: pass
    try: # street type
        nt = s.split('   ')[-1].strip()
        street_type = nt if len(nt)==2 and not(nt.isdigit()) else None
    except: pass

    street_name = None # street name
    nt = [x.strip() for x in s.split('  ') if x.strip()]
    for x in nt:
        test_x = [x.strip() for x in x.split(' ') if x.strip()]
        if len(test_x)==1 and not(street_number and street_number == test_x[0]) and not(street_type and street_type == test_x[0]):
            street_name = x; break
        elif not(street_number and street_number in test_x):
            street_name = x; break
        
    return street_number, direction, street_name, street_type
def split_location_column(df, location_col='Location'):
    print(df)
    parsed_data = []
    for idx, row in df.iterrows():
        string = row[location_col]
        street_number, direction, street_name, street_type = extract_location_data(string)
        parsed_data.append({
            # 'Location': string,
            'Street_Number': street_number,
            'Direction': direction,
            'Street_Name': street_name,
            'Street_Type': street_type
        })
    location_df = pd.DataFrame(parsed_data)
    # df = df.drop(columns=[location_col])
    # df = pd.concat([df, location_df], axis=1)
    return location_df

In [117]:
extractor = FunctionTransformer(split_location_column)
pipeline = Pipeline([
    ('extract', extractor),
    ('encode', OneHotEncoder())
])

# Define the column transformer
ct = ColumnTransformer([
    ('impute', SimpleImputer(strategy='constant', fill_value=''), ['Location']),
    ('extract_and_encode', pipeline, 'Location')
], remainder='passthrough')

In [118]:
result = ct.fit_transform(test_df)
pd.DataFrame(result)

0        1500    LEIGHTON                     AV
1         100 S  NORMANDIE                    AV
2         300 E  111TH                        ST
3        1300 S  LA BREA                      AV
4       11000    MORRISON                     ST
                          ...                   
4995     4600    MASCOT                       ST
4996     2200 E  7TH                          ST
4997             LANGDON                      AV
4998      400 E  5TH                          ST
4999    10100 S  SAN PEDRO                    ST
Name: Location, Length: 5000, dtype: object


AttributeError: 'Series' object has no attribute 'iterrows'

In [131]:
def date_extractor():
    return ''
def location_extractor():
    return ''

In [135]:
# Define the imputers
imputer = ColumnTransformer([
    ('cat', SimpleImputer(strategy='constant', fill_value='missing'), cat_columns),
    ('num', SimpleImputer(strategy='mean'), num_columns),
    ('ord', SimpleImputer(strategy='most_frequent'), ordinal_columns),
])

# Define the extractors
extractors = ColumnTransformer([
    ('location_extract', location_extractor, 'Location'),
    ('modus_extractor', TfidfVectorizer(), 'Modus_Operandi'),
    ('date', date_extractor, 'Time_Occurred')
])

# Define the pipeline for extraction and encoding
pipeline = Pipeline([
    ('imputer', imputer),
    ('extractors', FeatureUnion([
        ('location', Pipeline([
            ('extract', location_extractor),
            ('encode', OneHotEncoder(handle_unknown='ignore'))  # Encoding the extracted city names
        ])),
        ('modus_extractor', TfidfVectorizer(), 'Modus_Operandi'),
        ('date', date_extractor)
    ])),
    ('encoder', ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns),
        ('ord', OneHotEncoder(handle_unknown='ignore'), ordinal_columns),
    ], remainder='passthrough'))
])
pipeline