In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries; the
# download loop splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240629T033028Z
# and X-Goog-Expires=259200 (seconds), so it is presumably no longer valid after
# that window and has to be regenerated before this cell can download again.
DATA_SOURCE_MAPPING = 'f1nalyze-datathon-ieeecsmuj:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F82253%2F8965849%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240629%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240629T033028Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2c500ad4efda94a8f9c0c78507d8ecee3d64e7326f9f6b9a7c0c0fe8a3b6b60d16ee4668bc43fbf967a6acf3e403f6ff7d6be4666a37224b74011e10006a306d3bbfec92babf35c3e9ad92ca337e50c7dbd30be71ef467720883570eb90876cf065aed4ea2b00b43154afbfca42c86a1bd5953a5fe767514261ec23ab75feb17930088f82195cbb1559cebf4b43934d84e3023041dce2620584b72e5a9d15826266a2a36c6ed8d2481841088b1c0c7eeed48d1b03b71e805647cdf3245eeb5b1bb04c9546016451fa8bb5fcfd6281f5d40fc65aaa24585ac1d5ed0caea539cae844830d5856697d1993598d1c0a64947c3be465384f36208528fd2cccbd6142f'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining entries are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a raw ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the byte counter still
            # advances but the progress bar stays empty instead of crashing.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f"Downloading {directory}, {content_length or 'unknown'} bytes compressed")
            dl = 0
            # Stream the response in CHUNK_SIZE pieces until read() returns b''.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Make sure every buffered byte is in the file and rewind it before
            # handing the same file object to the archive readers.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the imported module with the opened archive object.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading f1nalyze-datathon-ieeecsmuj, 57575190 bytes compressed
Downloaded and uncompressed: f1nalyze-datathon-ieeecsmuj
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole /kaggle tree and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/lib/kaggle/gcp.py
/kaggle/input/f1nalyze-datathon-ieeecsmuj/sample_submission.csv
/kaggle/input/f1nalyze-datathon-ieeecsmuj/validation.csv
/kaggle/input/f1nalyze-datathon-ieeecsmuj/train.csv
/kaggle/input/f1nalyze-datathon-ieeecsmuj/test.csv
/kaggle/working/submission.csv


In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
# Load the training data and discard the columns that are not used as features.
df = pd.read_csv('/kaggle/input/f1nalyze-datathon-ieeecsmuj/train.csv', low_memory=False)
df = df.drop(columns=['fp1_date','fp1_time','fp2_time','fp2_date','fp3_time','fp3_date','quali_date','quali_time','sprint_date','sprint_time','url','url_x','url_y','positionText_y','dob','date','forename','surname','positionText_x','time_x','fastestLapTime','time_y','grand_prix'])
# Cells containing the literal text "\N" are treated as missing values.
df = df.replace(r'\\N', np.nan, regex=True)

# Drop rows with NaN values. The explicit copy makes `df` an independent frame,
# so the column assignments below no longer raise SettingWithCopyWarning.
df = df.dropna().copy()

# Replace each text column with integer codes. The encoder is refitted for
# every column, so codes are only meaningful within that column of this frame.
# NOTE(review): the other cells fit their own encoders on their own files, so
# the same category may get a different code there - confirm this is acceptable.
label_encoder = LabelEncoder()
columns = ['status','company','nationality_y','constructorRef','nationality','driverRef','driver_code']
for column in columns:
    df[column] = label_encoder.fit_transform(df[column])

# Force every column to a numeric dtype; values that cannot be parsed become NaN.
df = df.apply(pd.to_numeric, errors='coerce')

# Show a preview instead of dumping the whole frame.
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = label_encoder.fit_transform(df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = label_encoder.fit_transform(df[column])


Unnamed: 0,resultId,racerId,driverId,constructorId,number,grid,position_x,positionOrder,points,laps,...,driver_code,nationality,driverStandingsId,raceId_y,position,constructorRef,company,nationality_y,status,result_driver_standing
2083470,1517,90,4,4,8,5,3,3,6.0,58,...,0,9,4,18,4,11,11,3,1,6068
2083471,1517,90,4,4,8,5,3,3,6.0,58,...,0,9,12,19,7,11,11,3,1,18204
2083472,1517,90,4,4,8,5,3,3,6.0,58,...,0,9,30,20,9,11,11,3,1,45510
2083473,1517,90,4,4,8,5,3,3,6.0,58,...,0,9,51,21,10,11,11,3,1,77367
2083474,1517,90,4,4,8,5,3,3,6.0,58,...,0,9,72,22,8,11,11,3,1,109224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829958,22108,899,20,9,1,1,1,1,25.0,71,...,19,5,71692,1092,11,10,10,0,1,1584966736
2829959,22108,899,20,9,1,1,1,1,25.0,71,...,19,5,71758,1093,11,10,10,0,1,1586425864
2829960,22108,899,20,9,1,1,1,1,25.0,71,...,19,5,71780,1094,11,10,10,0,1,1586912240
2829961,22108,899,20,9,1,1,1,1,25.0,71,...,19,5,71802,1095,11,10,10,0,1,1587398616


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler



# Scale every column of the training frame - the features and the 'position'
# target alike - with one RobustScaler. `scaler` therefore also holds the
# centre/scale needed to map predictions back to real positions.
scaler = RobustScaler()
ds = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
corr_matrix = ds.corr()

print("Correlation matrix:")
print(corr_matrix)

# Features are all scaled columns except the target; the target is the scaled
# 'position' column, so the model learns and predicts in scaled units.
X = ds.drop(columns=['position'])
y = ds['position']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a regression model. random_state is fixed so that a re-run of the
# notebook reproduces the same model.
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate. This MSE is expressed in scaled target units.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# The target was divided by its scale factor, so squared errors shrink by that
# factor squared; multiply back to report the error in position units.
position_scale = scaler.scale_[df.columns.get_loc('position')]
print(f'Mean Squared Error (position units): {mse * position_scale ** 2}')

Correlation matrix:
                        resultId   racerId  driverId  constructorId    number  \
resultId                1.000000  0.895979  0.291918       0.304803 -0.050266   
racerId                 0.895979  1.000000  0.330241       0.341819 -0.068920   
driverId                0.291918  0.330241  1.000000      -0.063473  0.384292   
constructorId           0.304803  0.341819 -0.063473       1.000000  0.106289   
number                 -0.050266 -0.068920  0.384292       0.106289  1.000000   
grid                    0.127335  0.122389  0.395934       0.035832  0.329009   
position_x              0.198566  0.187323  0.452489       0.041461  0.350657   
positionOrder           0.198566  0.187323  0.452489       0.041461  0.350657   
points                  0.367202  0.328181 -0.262889       0.090627 -0.373463   
laps                   -0.158495 -0.126383 -0.098139      -0.044954 -0.063535   
timetaken_in_millisec   0.228905  0.207117  0.066969       0.052308 -0.007152   
fastestL

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler

# Load the validation data and apply the same column selection and cleaning
# steps that were applied to the training data.
val_df = pd.read_csv('/kaggle/input/f1nalyze-datathon-ieeecsmuj/validation.csv', low_memory=False)
val_df = val_df.drop(columns=['fp1_date','fp1_time','fp2_time','fp2_date','fp3_time','fp3_date','quali_date','quali_time','sprint_date','sprint_time','url','url_x','url_y','positionText_y','dob','date','forename','surname','positionText_x','time_x','fastestLapTime','time_y','grand_prix'])
# Cells containing the literal text "\N" are treated as missing values.
val_df = val_df.replace(r'\\N', np.nan, regex=True)

# Drop rows with NaN values; the copy avoids SettingWithCopyWarning below.
val_df = val_df.dropna().copy()

# NOTE(review): the encoder is fitted on the validation file itself, so a
# category may receive a different code than it did in the training frame.
label_encoder = LabelEncoder()
columns = ['status','company','nationality_y','constructorRef','nationality','driverRef','driver_code']
for column in columns:
    val_df[column] = label_encoder.fit_transform(val_df[column])
val_df = val_df.apply(pd.to_numeric, errors='coerce')

# Scale with the scaler that was fitted together with the model (`scaler`)
# instead of fitting a new one, so the validation features and target live in
# the same units the model was trained on.
scaled_columns = list(scaler.feature_names_in_)
val_scaled = pd.DataFrame(
    scaler.transform(val_df[scaled_columns]),
    columns=scaled_columns,
    index=val_df.index,
)
corr_matrix = val_df.corr()

print("Correlation matrix:")
print(corr_matrix)
X_val = val_scaled.drop(columns=['position'])
y_val = val_scaled['position']

# Evaluate the already trained model on the whole validation set. The model is
# deliberately NOT refitted here: calling fit() again would throw away what it
# learned from train.csv and make this score a training score.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error: {mse}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = label_encoder.fit_transform(df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = label_encoder.fit_transform(df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = label_encoder.fit_transform(df[column])


Correlation matrix:
                        resultId   racerId  driverId  constructorId    number  \
resultId                1.000000  0.996533  0.131101       0.061600  0.061257   
racerId                 0.996533  1.000000  0.125478       0.066340  0.060616   
driverId                0.131101  0.125478  1.000000      -0.201543  0.293028   
constructorId           0.061600  0.066340 -0.201543       1.000000  0.258669   
number                  0.061257  0.060616  0.293028       0.258669  1.000000   
grid                   -0.047845 -0.055955  0.226556      -0.184008 -0.057729   
position_x             -0.031062 -0.041034  0.309604      -0.242396 -0.041918   
positionOrder          -0.031062 -0.041034  0.309604      -0.242396 -0.041918   
points                  0.029481  0.035832 -0.312589       0.344085  0.084382   
laps                   -0.006471 -0.002592 -0.033595       0.017200 -0.018128   
timetaken_in_millisec  -0.076416 -0.073411  0.060720      -0.024558  0.008539   
fastestL

In [None]:
from sklearn.impute import SimpleImputer
# Load the test data and discard the columns that are not used as features.
df1 = pd.read_csv('/kaggle/input/f1nalyze-datathon-ieeecsmuj/test.csv', low_memory=False)
df1 = df1.drop(columns=['fp1_date','fp1_time','fp2_time','fp2_date','fp3_time','fp3_date','quali_date','quali_time','sprint_date','sprint_time','url','url_x','url_y','dob','date','forename','surname','positionText_x','time_x','fastestLapTime','time_y','grand_prix'])
# Cells containing the literal text "\N" are treated as missing values. Rows
# must not be dropped here (every test row needs a prediction), so gaps are
# filled from the next row that has a value.
df1 = df1.replace(r'\\N', np.nan, regex=True)
df1 = df1.bfill()

# NOTE(review): the encoder is fitted on the test file itself, so a category
# may receive a different code than it did in the training frame.
label_encoder = LabelEncoder()
columns = ['status','company','nationality_y','constructorRef','nationality','driverRef','driver_code']
for column in columns:
    df1[column] = label_encoder.fit_transform(df1[column])
# Force every column to a numeric dtype; values that cannot be parsed become NaN.
df1 = df1.apply(pd.to_numeric, errors='coerce')

# Fill whatever is still missing with the column mean, reporting the affected
# columns before and after.
columns_with_nan = df1.columns[df1.isna().any()].tolist()

print("Columns with NaN values:")
print(columns_with_nan)
mean_values = df1[columns_with_nan].mean()
print(mean_values)
df1[columns_with_nan] = df1[columns_with_nan].fillna(mean_values)
columns_with_nan = df1.columns[df1.isna().any()].tolist()

print("Columns with NaN values:")
print(columns_with_nan)
print(df1.dtypes)

# Scale the test features with the centre/scale learned by the scaler that was
# fitted together with the model. That scaler also covers the 'position'
# target, which the test frame lacks, so the statistics are looked up by
# column name rather than calling scaler.transform() on the frame.
scaler_stats = pd.DataFrame(
    {'center': scaler.center_, 'scale': scaler.scale_},
    index=scaler.feature_names_in_,
)
feature_columns = list(model.feature_names_in_)
ds = (
    df1[feature_columns] - scaler_stats.loc[feature_columns, 'center']
) / scaler_stats.loc[feature_columns, 'scale']

# Predict in the scaled target space, then undo the target scaling so that
# y_pred holds positions in their original units.
y_pred_scaled = model.predict(ds)
y_pred = y_pred_scaled * scaler_stats.loc['position', 'scale'] + scaler_stats.loc['position', 'center']
print(y_pred)


Columns with NaN values:
['position_x', 'timetaken_in_millisec', 'fastestLap', 'max_speed']
position_x               7.293331e+00
timetaken_in_millisec    5.741776e+06
fastestLap               4.695696e+01
max_speed                2.109942e+02
dtype: float64
Columns with NaN values:
[]
resultId                    int64
racerId                     int64
driverId                    int64
constructorId               int64
number                      int64
grid                        int64
position_x                float64
positionOrder               int64
points                    float64
laps                        int64
timetaken_in_millisec     float64
fastestLap                float64
rank                        int64
max_speed                 float64
statusId                    int64
year                        int64
round                       int64
circuitId                   int64
driverRef                   int64
driver_num                  int64
driver_code                 int64

In [None]:
# Store the predictions next to the test rows, then export the two submission
# columns (the frame's index is written as the first CSV column).
df1['position'] = y_pred
submission_columns = ['position', 'result_driver_standing']
df1.loc[:, submission_columns].to_csv('/kaggle/working/submission.csv')