# Erdős Institute
## Data Science Boot Camp Project: Police Interaction
--- 

In [None]:
import pandas as pd

# Load the tab-separated survey file, reading every column as a string
# (later cells compare against string codes such as '1' and '2').
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(
    '/Users/patrickma1994/Documents/data.tsv',
    sep='\t',
    dtype='str',
)
# Summarise the columns, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Tabulate the codes that occur in V347 so they can be compared with the codebook
print(df.loc[:, 'V347'].value_counts())
# Display every column name available in the raw frame
df.columns.tolist()

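By checking the codebook and the value counts above, we select the columns `AGE`, `HISP`, `SEX`, `INCOME` and `V347` (renamed to `proper_behave`) for the model.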

In [None]:
# Keep the four predictors plus V347, and give V347 the readable name `proper_behave`
df_m = (
    df.loc[:, ['AGE', 'HISP', 'SEX', 'INCOME', 'V347']]
    .rename(columns={'V347': 'proper_behave'})
)
df_m.head()

In [None]:
# Convert the predictor columns to categorical dtype
for col in ['AGE', 'HISP', 'SEX', 'INCOME']:
    df_m[col] = pd.Categorical(df_m[col])

# Keep only rows where proper_behave is '1' or '2'.
# The explicit .copy() makes the filtered result an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
# NOTE: this cell is not safe to re-run on its own — once proper_behave has been recoded to
# integers, this string filter matches nothing. Re-run the column-selection cell first.
df_m = df_m[df_m['proper_behave'].isin(['1', '2'])].copy()

# Recode the target ('1' -> 1, '2' -> 0) and make sure it is numeric
df_m['proper_behave'] = pd.to_numeric(df_m['proper_behave'].map({'1': 1, '2': 0}))

# Report the frame dimensions (a bare `df_m.shape` in the middle of a cell displays nothing)
print("Shape after filtering:", df_m.shape)

# Check missing values in all columns
print("Missing values in each column:")
print(df_m.isnull().sum())
# The counts printed above are expected to be zero for the current dataset

The following code drops rows with missing values, although our data has no missing values in the selected features.

In [None]:
# Remove every row that lacks a value in any of the four predictors
df_m = df_m.dropna(subset=['AGE', 'HISP', 'SEX', 'INCOME'], how='any')

# Report the dimensions that remain after the drop
print("\nDataFrame shape after dropping missing values:", df_m.shape, sep="\n")

# Confirm that the predictors no longer contain nulls
print(
    "\nRemaining missing values:",
    df_m.loc[:, ['AGE', 'HISP', 'SEX', 'INCOME']].isna().sum(),
    sep="\n",
)

### Logistic regression result
---
In the following model setup, the dependent variable (y) is "proper_behave", and the independent variables are the following:
1. HISP: the interviewee's ethnicity
2. INCOME: the interviewee's income level
3. SEX: the interviewee's gender
4. AGE: the interviewee's age

The data set is split into 80% for training and 20% for testing. A confusion matrix is provided below.

In [None]:
# Import required libraries (grouped at the top of the cell)
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# One-hot encode the categorical predictors; drop_first=True omits the first level of each
# predictor so that level acts as the reference category.
X = pd.get_dummies(df_m[['AGE', 'HISP', 'SEX', 'INCOME']], drop_first=True)
y = df_m['proper_behave']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the logistic regression model on the training split
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train, y_train)

print("Model accuracy:", model_lr.score(X_test, y_test))

# Get predictions on the test set
y_pred = model_lr.predict(X_test)

# Create and plot the confusion matrix.
# labels=[0, 1] pins the row/column order so it always lines up with display_labels
# (which treats 0 as 'No' and 1 as 'Yes'), even if one class is absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No', 'Yes'])
disp.plot(cmap='Reds')
plt.title('Confusion Matrix')
plt.show()

# Print per-class precision, recall and F1 on the test set
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Get model coefficients, one per dummy-encoded feature.
# The fitted estimator is `model_lr`; the name `model` is never defined in this notebook,
# so the original reference raised NameError on a fresh kernel.
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model_lr.coef_[0]
})
print("\nModel coefficients:")
print(coef_df)

Logistic regression: adding 5-fold cross-validation for a more reliable estimate of model performance.

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Metrics to evaluate: display name -> scikit-learn scorer string
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_macro',
    'recall': 'recall_macro',
    'f1': 'f1_macro',
}

# Perform 5-fold cross-validation once and score every metric on the same fitted folds,
# instead of refitting the model separately for each metric.
cv_results = cross_validate(model_lr, X, y, cv=5, scoring=scoring)

# cross_validate stores each metric's per-fold scores under the key 'test_<name>'
cv_scores = {metric: cv_results[f'test_{metric}'] for metric in scoring}

# Print results
for metric, scores in cv_scores.items():
    print(f"{metric.capitalize()} scores for each fold: {scores}")
    print(f"Mean {metric}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    print()
