# Logistic Regression

## Imports

In [1]:
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, classification_report

## Load the data

In [2]:
# Make the Google Drive contents reachable under /content/drive
drive.mount('/content/drive')

# Read the contract data set into a DataFrame.
# NOTE(review): the saved cell output echoes the read_csv line the way a Python
# warning does -- possibly a DtypeWarning about mixed-type columns. Confirm, and
# if so consider passing explicit `dtype=` for the affected columns.
file_path = '/content/drive/My Drive/Capstone/Data/contract_classification.csv'
df = pd.read_csv(file_path)

# Report how many rows were loaded
num_observations = df.shape[0]
print(f"Number of observations: {num_observations}")

# Report the column labels
column_names = df.columns
print("Column names:", column_names, sep="\n")

# Report the dtype pandas inferred for every column
column_types = df.dtypes
print("Column types:", column_types, sep="\n")

Mounted at /content/drive


  df = pd.read_csv(file_path)


Number of observations: 11574439
Column names:
Index(['ANO_SID', 'CORPORATE_DEVISION', 'ORTPLZ', 'ORTS-NAME', 'STRASSE',
       'SUM_INSURED', 'CONSTRACTION_DESIGN', 'CONSTRUCTION_YEAR', 'WFL',
       'ZONE', 'SF-SYSTEM', 'TYPE_OF_DEDUCTIBLE', 'DRAIN_PIPE_INSURED',
       'PRODUCTLINE', 'PRIOR_DAMAGES', 'UVV-KZ', 'UNDERWRITER', 'PARTY-ID',
       'contract_year', 'PIPE_PREMIUM_AMOUNT', 'YEAR', 'DAMAGE'],
      dtype='object')
Column types:
ANO_SID                float64
CORPORATE_DEVISION      object
ORTPLZ                 float64
ORTS-NAME               object
STRASSE                 object
SUM_INSURED            float64
CONSTRACTION_DESIGN     object
CONSTRUCTION_YEAR      float64
WFL                    float64
ZONE                    object
SF-SYSTEM              float64
TYPE_OF_DEDUCTIBLE       int64
DRAIN_PIPE_INSURED       int64
PRODUCTLINE             object
PRIOR_DAMAGES            int64
UVV-KZ                   int64
UNDERWRITER             object
PARTY-ID                object

## Preprocess the data

In [None]:
# Categorical columns that are one-hot encoded (first level dropped as baseline)
categorical_columns = ['CORPORATE_DEVISION', 'ORTS-NAME', 'STRASSE', 'CONSTRACTION_DESIGN', 'ZONE', 'PRODUCTLINE', 'UNDERWRITER', 'PARTY-ID']

# `df` is rebound to the encoded frame below, so after the first run of this cell
# the original categorical columns no longer exist. Encode only the columns that
# are still present so that re-running the cell does not fail with a KeyError,
# and report anything that was skipped so a genuinely missing column is visible.
columns_to_encode = [col for col in categorical_columns if col in df.columns]
skipped_columns = [col for col in categorical_columns if col not in df.columns]
if skipped_columns:
    print(f"Not present (already encoded or missing), skipped: {skipped_columns}")

# NOTE(review): ORTS-NAME, STRASSE and PARTY-ID look like high-cardinality fields
# (assumption from the names -- confirm with df[col].nunique()). get_dummies adds
# one dense column per distinct level, which may not fit in memory for the
# ~11.5 million rows reported when the data was loaded.
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

# Replace every remaining missing value with 0 (applies to all columns,
# including 'DAMAGE' and 'contract_year')
df = df.fillna(0)

# Features: everything except the target and the column used to split by year.
# NOTE(review): 'ANO_SID' and 'YEAR' stay in X -- confirm that is intended.
X = df.drop(['DAMAGE', 'contract_year'], axis=1)
# Target
y = df['DAMAGE']

# Distinct contract years in ascending order; drives the rolling-window loop
years = sorted(df['contract_year'].unique())

## Rolling Window Estimation and Logistic Regression

In [None]:
# (test_year, recall) pairs, one entry per evaluated test year
recall_scores = []

# Walk over consecutive year pairs: fit on one year, evaluate on the following one
for train_year, test_year in zip(years[:-1], years[1:]):
    # Build each boolean row mask once and reuse it for both features and target
    train_mask = df['contract_year'] == train_year
    test_mask = df['contract_year'] == test_year

    X_train, y_train = X.loc[train_mask], y.loc[train_mask]
    X_test, y_test = X.loc[test_mask], y.loc[test_mask]

    # LogisticRegression.fit raises ValueError when the training target holds a
    # single class; skip such a window instead of aborting the whole loop
    if y_train.nunique() < 2:
        print(f"Skipping test year {test_year}: training year {train_year} contains a single class.")
        continue

    # Standardize the features; the scaler is fitted on the training year only
    # and then applied unchanged to the test year
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Logistic regression with class weights balanced by class frequency
    log_reg = LogisticRegression(class_weight='balanced', max_iter=1000)

    # Fit on the training year
    log_reg.fit(X_train_scaled, y_train)

    # Predict the test year
    y_pred = log_reg.predict(X_test_scaled)

    # Record the recall for this test year
    recall = recall_score(y_test, y_pred)
    recall_scores.append((test_year, recall))

    # Print the classification report for each year
    print(f"Classification report for year {test_year}:")
    print(classification_report(y_test, y_pred))

# Print recall scores
for year, score in recall_scores:
    print(f"Year: {year}, Recall Score: {score}")

## Summary of the Results

In [None]:
# Tabulate the (year, recall) pairs collected by the rolling-window loop
recall_df = pd.DataFrame(data=recall_scores, columns=['Year', 'Recall'])

# Show the table beneath a heading line
print("Recall scores over the years:", recall_df, sep="\n")