In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the company records from the CSV file into a DataFrame.
data = pd.read_csv('/content/1000_Companies.csv')

# Preview the first five rows under a heading (one print call, newline-separated).
print("Original Dataset:", data.head(), sep="\n")



Original Dataset:
   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [None]:
# Step 1: Check for missing values.
# This cell only reports the number of nulls per column; it does not modify `data`.
print("\nMissing values in the dataset:", data.isna().sum(), sep="\n")



Missing values in the dataset:
R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64


In [None]:
# Step 2: Encode categorical variables (State)
# One-hot encode the categorical 'State' column and pass every other column
# through unchanged.
column_transformer = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a State value that was not present when the
        # encoder was fitted is encoded as all zeros at predict time instead of
        # raising an error. Encodings for known states are unaffected.
        ('one_hot', OneHotEncoder(handle_unknown='ignore'), ['State'])
    ],
    remainder='passthrough'  # Keep the other columns as they are
)


In [None]:
# Step 3: Split the frame into the regression target and the model inputs.
y = data.loc[:, 'Profit']  # Target: the column the model learns to predict
X = data.loc[:, ['R&D Spend', 'Administration', 'Marketing Spend', 'State']]  # Features


In [None]:
# Step 4: Chain preprocessing and the estimator into a single pipeline, so the
# same column transformation is applied at fit time and at predict time.
model = Pipeline([
    ('preprocessor', column_transformer),  # encodes 'State', passes the rest through
    ('regressor', LinearRegression()),     # ordinary least-squares regression
])


In [None]:
# Step 5: Fit the pipeline on the full feature matrix and target.
model.fit(X, y)

# Step 6: Predict the profit for a single new company.
# The record uses the same column names as the training features.
# NOTE(review): the Administration value below is several times larger than the
# values visible in the dataset preview, so this prediction may be an
# extrapolation beyond the training range — confirm the input values.
new_data = pd.DataFrame([{
    'R&D Spend': 91694.48,
    'Administration': 515841.3,
    'Marketing Spend': 11931.24,
    'State': 'Florida',
}])

# predict() returns one value per input row; only the first (and only) one is shown.
predicted_profit = model.predict(new_data)

print("\nPredicted Profit for the given input:", predicted_profit[0], sep="\n")


Predicted Profit for the given input:
510570.99261086184
