<a href="https://colab.research.google.com/github/mushir2004/train-and-test/blob/main/train_and_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def prepare_features_and_target(data, target_variable, excluded_feature=None):
    """Return a numeric, NaN-free feature frame and its matching numeric target.

    Steps, in order:
      1. Coerce the target column to numeric and keep only the rows where that
         succeeds, so label/unit rows embedded in the sheet are discarded
         instead of reaching the model as strings.
      2. Coerce every remaining column (all columns except ``target_variable``
         and ``excluded_feature``) to numeric.
      3. Drop feature columns that are not numeric after coercion or that
         contain no values at all. An all-NaN column has a NaN median, so it
         could never be imputed and would cause every row to be dropped.
      4. Fill the remaining gaps with the per-column median.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw sheet; must contain ``target_variable``.
    target_variable : str
        Name of the column to predict.
    excluded_feature : str, optional
        Name of a column to leave out of the features.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Features and target sharing the same index (a subset of ``data.index``).

    Raises
    ------
    KeyError
        If ``target_variable`` is not a column of ``data``.
    """
    feature_columns = [
        col for col in data.columns
        if col != target_variable and col != excluded_feature
    ]

    numeric_target = pd.to_numeric(data[target_variable], errors='coerce')
    has_target = numeric_target.notna()

    numeric_features = (
        data.loc[has_target, feature_columns]
        .apply(pd.to_numeric, errors='coerce')
        .select_dtypes(include='number')   # discard anything still non-numeric
        .dropna(axis=1, how='all')         # discard columns with nothing to impute from
    )

    # NOTE(review): medians are computed over all kept rows, i.e. before the
    # train/test split below; move the imputation after the split if leakage
    # from the test rows matters for this analysis.
    features_filled = numeric_features.fillna(numeric_features.median())
    return features_filled, numeric_target.loc[has_target]


# Load your data
try:
    data = pd.read_excel('/content/AGING TYRE RFID_2.xlsx')
except Exception as e:
    # Best effort: report the problem and fall through to the "empty" branch.
    print(f"Failed to load data: {e}")
    data = pd.DataFrame()  # Set to empty DataFrame if loading fails

if data.empty:
    print("Data is empty or not loaded properly.")
else:
    # Inspect column names and preview data
    print("Column names:", data.columns)
    print("First few rows of data:\n", data.head())

    # Check for missing values in the raw data
    print("Missing values in the dataset:\n", data.isna().sum())

    # Define target variable and features using correct column names
    target_variable = 'Unnamed: 5'  # Update this to the correct target column name if necessary
    excluded_feature = 'Unnamed: 3' # Update this if there are columns to exclude
    features_columns = [col for col in data.columns if col != excluded_feature and col != target_variable]

    if target_variable not in data.columns:
        print(f"'{target_variable}' not found in columns. Available columns: {data.columns}")
        target_variable = None  # Set to None if target column is not found
        print("Target variable is not correctly defined.")
    else:
        # Raw (uncleaned) views, kept for the diagnostics below. Both lookups
        # are safe: the names come from data.columns and the target was
        # verified just above, so no KeyError handling is needed here.
        features = data[features_columns]
        target = data[target_variable]

        if features.empty or target.empty:
            print("Features or target data is empty. Check your data.")
        else:
            # Check initial shapes and missing values
            print("Initial features shape:", features.shape)
            print("Initial target shape:", target.shape)
            print("Features with NaNs:\n", features.isna().sum())
            print("Target with NaNs:\n", target.isna().sum())

            # Coerce to numeric, drop unusable rows/columns, impute medians
            features_filled, target_cleaned = prepare_features_and_target(
                data, target_variable, excluded_feature
            )

            dropped_columns = [col for col in features_columns if col not in features_filled.columns]
            if dropped_columns:
                print("Feature columns dropped (non-numeric or entirely empty):", dropped_columns)

            # Check the effect of conversion
            print("Features after conversion:\n", features_filled.head())
            print("Features shape after filling NaNs:", features_filled.shape)

            # Features and target share an index, so this is a plain column join
            data_cleaned = pd.concat([features_filled, target_cleaned], axis=1)
            print(f"Shape of cleaned data: {data_cleaned.shape}")

            # Proceed with splitting the data into training and testing sets
            features_cleaned = data_cleaned.drop(columns=[target_variable])
            target_cleaned = data_cleaned[target_variable]

            if features_cleaned.shape[1] == 0:
                print("No usable numeric feature columns remain after cleaning.")
            elif len(features_cleaned) < 2:
                print("Not enough samples to split into training and testing sets.")
            else:
                X_train, X_test, y_train, y_test = train_test_split(features_cleaned, target_cleaned, test_size=0.2, random_state=42)

                # Initialize the model; seeded so the reported error is reproducible
                model = RandomForestRegressor(random_state=42)

                # Train the model
                model.fit(X_train, y_train)

                # Make predictions on the test set
                y_pred = model.predict(X_test)

                # Evaluate the model
                mse = mean_squared_error(y_test, y_pred)
                print(f"Mean Squared Error: {mse}")

                # Optionally, print a comparison of some actual vs. predicted values
                results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
                print(results.head())


Column names: Index(['Tire Size:', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17',
       'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21',
       'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25',
       'Unnamed: 26', 'Unnamed: 27'],
      dtype='object')
First few rows of data:
             Tire Size: Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5  \
0                  NaN        NaN        NaN       Temp      Dist.       Rad.   
1                 Date       Time       Step       (ºC)       (km)       (cm)   
2  2022-03-07 00:00:00   13:09:01          1       30.1          0       49.9   
3  2022-03-07 00:00:00   13:10:01          1         29          0       49.9   
4  2022-03-07 00:00:00   13:11:01          1       29.1          0       4