# Logistic Regression

## Imports

In [2]:
import pandas as pd
# from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, classification_report

In [5]:
from tqdm import tqdm

## Load the data

In [9]:
# Read the contract classification CSV from a relative path on the local machine
local_csv_path = '../../data/contract_classification.csv'
df = pd.read_csv(local_csv_path)

  df = pd.read_csv('../../data/contract_classification.csv')


In [2]:
# Load the data from Google Drive when the notebook runs on Google Colab,
# then print a short description of the loaded DataFrame.
#
# `drive` is imported here because the top-of-notebook import is commented
# out, which made `drive.mount(...)` fail with a NameError on a fresh kernel.
# Outside Colab the import is unavailable; in that case the mount and reload
# are skipped and the `df` loaded from the local machine is kept.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    # Mount Google Drive
    drive.mount('/content/drive')

    # Load the CSV file
    file_path = '/content/drive/My Drive/Capstone/Data/contract_classification.csv'
    df = pd.read_csv(file_path)

# Count all observations
num_observations = len(df)
print(f"Number of observations: {num_observations}")

# List all column names
column_names = df.columns
print("Column names:")
print(column_names)

# Display the types of the columns
column_types = df.dtypes
print("Column types:")
print(column_types)

Mounted at /content/drive


  df = pd.read_csv(file_path)


Number of observations: 11574439
Column names:
Index(['ANO_SID', 'CORPORATE_DEVISION', 'ORTPLZ', 'ORTS-NAME', 'STRASSE',
       'SUM_INSURED', 'CONSTRACTION_DESIGN', 'CONSTRUCTION_YEAR', 'WFL',
       'ZONE', 'SF-SYSTEM', 'TYPE_OF_DEDUCTIBLE', 'DRAIN_PIPE_INSURED',
       'PRODUCTLINE', 'PRIOR_DAMAGES', 'UVV-KZ', 'UNDERWRITER', 'PARTY-ID',
       'contract_year', 'PIPE_PREMIUM_AMOUNT', 'YEAR', 'DAMAGE'],
      dtype='object')
Column types:
ANO_SID                float64
CORPORATE_DEVISION      object
ORTPLZ                 float64
ORTS-NAME               object
STRASSE                 object
SUM_INSURED            float64
CONSTRACTION_DESIGN     object
CONSTRUCTION_YEAR      float64
WFL                    float64
ZONE                    object
SF-SYSTEM              float64
TYPE_OF_DEDUCTIBLE       int64
DRAIN_PIPE_INSURED       int64
PRODUCTLINE             object
PRIOR_DAMAGES            int64
UVV-KZ                   int64
UNDERWRITER             object
PARTY-ID                objec

## Preprocess the data

In [10]:
# Remove the columns the model does not use, then keep only complete rows
# (any row with at least one missing value is discarded)
unused_columns = ['SF-SYSTEM', 'ANO_SID', 'ORTS-NAME', 'STRASSE', 'PARTY-ID']
df = df.drop(columns=unused_columns).dropna()

In [11]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so it acts as the reference category.
# Any remaining missing values are replaced with 0.
categorical_columns = ['CORPORATE_DEVISION', 'CONSTRACTION_DESIGN', 'ZONE', 'PRODUCTLINE', 'UNDERWRITER']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True).fillna(0)

# Target is the DAMAGE column; the features are every remaining column except
# the target itself and contract_year, which is only used to split by year
non_feature_columns = ['DAMAGE', 'contract_year']
y = df['DAMAGE']
X = df.drop(columns=non_feature_columns)

# Distinct contract years in ascending order, used to build the rolling windows
years = sorted(df['contract_year'].unique())

## Rolling Window Estimation and Logistic Regression

In [12]:
# Rolling one-step-ahead evaluation: for every pair of consecutive contract
# years, fit a logistic regression on the earlier year and score it on the
# later one.
#
# Results:
#   recall_scores           list of (test_year, recall) tuples, used by the
#                           summary cell
#   classification_reports  list of (test_year, report text) tuples, printed
#                           once the loop has finished so the text is not
#                           interleaved with the tqdm progress bar
recall_scores = []
classification_reports = []

contract_year = df['contract_year']

# Consecutive (train_year, test_year) pairs; empty when fewer than two years exist
year_pairs = list(zip(years, years[1:]))

# The rows tested in one iteration are the rows trained on in the next, so each
# year's boolean mask over the full frame is computed once and carried over
# instead of being recomputed for every X/y selection.
test_mask = None
for train_year, test_year in tqdm(year_pairs):
    train_mask = test_mask if test_mask is not None else (contract_year == train_year)
    test_mask = contract_year == test_year

    # Split data into training and testing sets based on year
    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    # Standardize the features; the scaler is fitted on the training year only
    # and then applied unchanged to the test year
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # class_weight='balanced' reweights the classes inversely to their
    # frequency in the training year
    log_reg = LogisticRegression(class_weight='balanced', max_iter=1000)
    log_reg.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = log_reg.predict(X_test_scaled)

    # Recall with scikit-learn's defaults (binary average, positive label 1)
    recall = recall_score(y_test, y_pred)
    recall_scores.append((test_year, recall))

    # Keep the full per-class report for printing after the loop
    classification_reports.append((test_year, classification_report(y_test, y_pred)))

# Print the classification report for each year
for report_year, report_text in classification_reports:
    print(f"Classification report for year {report_year}:")
    print(report_text)

# Print recall scores
for year, score in recall_scores:
    print(f"Year: {year}, Recall Score: {score}")

  0%|          | 0/10 [00:00<?, ?it/s]

Classification report for year 2015-01-01:


 10%|█         | 1/10 [00:09<01:25,  9.45s/it]

              precision    recall  f1-score   support

           0       1.00      0.82      0.90    768070
           1       0.05      0.77      0.10      9606

    accuracy                           0.82    777676
   macro avg       0.52      0.79      0.50    777676
weighted avg       0.98      0.82      0.89    777676

Classification report for year 2016-01-01:


 20%|██        | 2/10 [00:19<01:17,  9.63s/it]

              precision    recall  f1-score   support

           0       1.00      0.79      0.88    797816
           1       0.05      0.79      0.09     10694

    accuracy                           0.79    808510
   macro avg       0.52      0.79      0.49    808510
weighted avg       0.98      0.79      0.87    808510

Classification report for year 2017-01-01:


 30%|███       | 3/10 [00:27<01:04,  9.15s/it]

              precision    recall  f1-score   support

           0       1.00      0.78      0.88    754974
           1       0.05      0.78      0.09     10393

    accuracy                           0.78    765367
   macro avg       0.52      0.78      0.48    765367
weighted avg       0.98      0.78      0.87    765367

Classification report for year 2018-01-01:


 40%|████      | 4/10 [00:37<00:55,  9.33s/it]

              precision    recall  f1-score   support

           0       1.00      0.77      0.87    756661
           1       0.05      0.77      0.09     11321

    accuracy                           0.77    767982
   macro avg       0.52      0.77      0.48    767982
weighted avg       0.98      0.77      0.86    767982

Classification report for year 2019-01-01:


 50%|█████     | 5/10 [00:45<00:44,  8.93s/it]

              precision    recall  f1-score   support

           0       1.00      0.76      0.86    762788
           1       0.05      0.79      0.09     12360

    accuracy                           0.76    775148
   macro avg       0.52      0.77      0.48    775148
weighted avg       0.98      0.76      0.85    775148

Classification report for year 2020-01-01:


 60%|██████    | 6/10 [00:54<00:35,  8.79s/it]

              precision    recall  f1-score   support

           0       1.00      0.75      0.85    773937
           1       0.06      0.80      0.10     14300

    accuracy                           0.75    788237
   macro avg       0.53      0.78      0.48    788237
weighted avg       0.98      0.75      0.84    788237

Classification report for year 2021-01-01:


 70%|███████   | 7/10 [01:03<00:27,  9.04s/it]

              precision    recall  f1-score   support

           0       0.99      0.73      0.84    780481
           1       0.06      0.82      0.11     15702

    accuracy                           0.73    796183
   macro avg       0.53      0.77      0.47    796183
weighted avg       0.98      0.73      0.83    796183

Classification report for year 2022-01-01:


 80%|████████  | 8/10 [01:12<00:18,  9.12s/it]

              precision    recall  f1-score   support

           0       1.00      0.71      0.83    791578
           1       0.05      0.83      0.09     13219

    accuracy                           0.72    804797
   macro avg       0.52      0.77      0.46    804797
weighted avg       0.98      0.72      0.82    804797

Classification report for year 2023-01-01:


 90%|█████████ | 9/10 [01:23<00:09,  9.58s/it]

              precision    recall  f1-score   support

           0       1.00      0.70      0.82    794432
           1       0.04      0.83      0.07     11113

    accuracy                           0.70    805545
   macro avg       0.52      0.77      0.45    805545
weighted avg       0.98      0.70      0.81    805545

Classification report for year 2024-01-01:


100%|██████████| 10/10 [01:33<00:00,  9.40s/it]

              precision    recall  f1-score   support

           0       1.00      0.68      0.81    799514
           1       0.01      0.86      0.01      2048

    accuracy                           0.68    801562
   macro avg       0.50      0.77      0.41    801562
weighted avg       1.00      0.68      0.81    801562

Year: 2015-01-01, Recall Score: 0.7660836976889445
Year: 2016-01-01, Recall Score: 0.7853936786983355
Year: 2017-01-01, Recall Score: 0.7793707302992399
Year: 2018-01-01, Recall Score: 0.7737832346965816
Year: 2019-01-01, Recall Score: 0.7912621359223301
Year: 2020-01-01, Recall Score: 0.8025874125874126
Year: 2021-01-01, Recall Score: 0.8160106992739778
Year: 2022-01-01, Recall Score: 0.8345563204478402
Year: 2023-01-01, Recall Score: 0.8346081166201745
Year: 2024-01-01, Recall Score: 0.857421875





## Summary of the Results

In [13]:
# Tabulate the (year, recall) pairs collected by the rolling evaluation
recall_df = pd.DataFrame(data=recall_scores, columns=['Year', 'Recall'])

# Print the heading on one line followed by the table
print("Recall scores over the years:", recall_df, sep="\n")

Recall scores over the years:
         Year    Recall
0  2015-01-01  0.766084
1  2016-01-01  0.785394
2  2017-01-01  0.779371
3  2018-01-01  0.773783
4  2019-01-01  0.791262
5  2020-01-01  0.802587
6  2021-01-01  0.816011
7  2022-01-01  0.834556
8  2023-01-01  0.834608
9  2024-01-01  0.857422
