In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import roc_auc_score


### Loading data

In [23]:
# Comp 1
#train_data = pd.read_csv("C:/Users/johne/Downloads/playground-series-s4e7/train.csv")
#test_data = pd.read_csv("C:/Users/johne/Downloads/playground-series-s4e7/test.csv")

# Comp 2
train_data = pd.read_csv("C:/Users/matt/Downloads/train.csv/train.csv")
test_data = pd.read_csv("C:/Users/matt/Downloads/test.csv/test.csv")


# Display the first few rows of each dataframe
print(train_data.head())
print(test_data.head())

   id  Gender  Age  Driving_License  Region_Code  Previously_Insured  \
0   0    Male   21                1         35.0                   0   
1   1    Male   43                1         28.0                   0   
2   2  Female   25                1         14.0                   1   
3   3  Female   35                1          1.0                   0   
4   4  Female   36                1         15.0                   1   

  Vehicle_Age Vehicle_Damage  Annual_Premium  Policy_Sales_Channel  Vintage  \
0    1-2 Year            Yes         65101.0                 124.0      187   
1   > 2 Years            Yes         58911.0                  26.0      288   
2    < 1 Year             No         38043.0                 152.0      254   
3    1-2 Year            Yes          2630.0                 156.0       76   
4    1-2 Year             No         31951.0                 152.0      294   

   Response  
0         0  
1         1  
2         0  
3         0  
4         0  
        

In [24]:

# Split features and target
X = train_data.drop(['Response', 'id'], axis=1)
y = train_data['Response']
test_ids = test_data['id']
X_test = test_data.drop(['id'], axis=1)

# Identify numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [27]:
# Check the data types of Region_Code and Age
print(train_data[['Region_Code', 'Age']].dtypes)
print(test_data[['Region_Code', 'Age']].dtypes)

Region_Code    float64
Age              int64
dtype: object
Region_Code    float64
Age              int64
dtype: object


In [31]:
# Feature Engineering
# Function to process a chunk of data
def process_chunk(df):
    df['Region_Code'] = df['Region_Code'].astype(int).astype(str)
    df['Age'] = df['Age'].astype(str)
    df['Region_Code_Age'] = df['Region_Code'] + '_' + df['Age']
    df = pd.get_dummies(df, columns=['Region_Code_Age'], drop_first=True)
    return df

# Process train_data in chunks
chunk_size = 500  # Adjust chunk size based on your memory capacity
train_chunks = []
for chunk in pd.read_csv('C:/Users/matt/Downloads/train.csv/train.csv', chunksize=chunk_size):
    processed_chunk = process_chunk(chunk)
    train_chunks.append(processed_chunk)

train_data_processed = pd.concat(train_chunks, ignore_index=True)

# Process test_data in chunks
test_chunks = []
for chunk in pd.read_csv('C:/Users/matt/Downloads/test.csv/test.csv', chunksize=chunk_size):
    processed_chunk = process_chunk(chunk)
    test_chunks.append(processed_chunk)

test_data_processed = pd.concat(test_chunks, ignore_index=True)



ParserError: Error tokenizing data. C error: out of memory

### Preprocessing

In [None]:
# Preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])


### Defining Model

In [None]:
# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])


### Training 

In [None]:

# Split data for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Train the model
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



### Testing

In [None]:
# Predict on validation set and evaluate
y_valid_pred = model.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, y_valid_pred)
print(f'Validation ROC AUC: {roc_auc}')

Validation ROC AUC: 0.8782425125130968


In [None]:
# Predict on the test set
test_predictions = model.predict_proba(X_test)[:, 1]

### Preparing Submission

In [None]:
# Prepare the submission file
submission = pd.DataFrame({'id': test_ids, 'Response': test_predictions})
submission.to_csv('C:/Users/matt/Downloads/submission3.csv', index=False)
print('Submission file created successfully!')

Submission file created successfully!
