In [7]:
import pandas as pd
import numpy as np
# Read the company dataset into a DataFrame and preview its first rows.
csv_path = "/content/company.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [10]:
# Sub-frame holding only the columns whose dtype is numeric.
numeric_columns = df.select_dtypes(include=[np.number])
numeric_columns.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,165349.2,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [13]:
# Sub-frame holding every column whose dtype is NOT numeric.
categorical_columns = df.select_dtypes(exclude=[np.number])
categorical_columns.head()

Unnamed: 0,State
0,New York
1,California
2,Florida
3,New York
4,Florida


In [16]:
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result must be assigned back for the duplicate rows to actually be
# removed from `df` before it is used for modelling.
# The original row labels are kept (no reset_index).
df = df.drop_duplicates()
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.20,136897.800,471784.1000,New York,192261.83000
1,162597.70,151377.590,443898.5300,California,191792.06000
2,153441.51,101145.550,407934.5400,Florida,191050.39000
3,144372.41,118671.850,383199.6200,New York,182901.99000
4,142107.34,91391.770,366168.4200,Florida,166187.94000
...,...,...,...,...,...
995,54135.00,118451.999,173232.6695,California,95279.96251
996,134970.00,130390.080,329204.0228,California,164336.60550
997,100275.47,241926.310,227142.8200,California,413956.48000
998,128456.23,321652.140,281692.3200,California,333962.19000


In [15]:
# Collect, per numeric column, the values whose z-score magnitude exceeds 3.
# Columns without any such value are left out of the result dict.
zscore_outliers = {}
for name, series in numeric_columns.items():
    sigma = series.std()
    if sigma == 0:  # constant column: dividing by sigma would be a division by zero
        continue
    mu = series.mean()
    standardized = (series - mu) / sigma
    flagged = series[np.abs(standardized) > 3].tolist()
    if flagged:
        zscore_outliers[name] = flagged

print("Outliers detected using Z-score method:")
print(zscore_outliers)

Outliers detected using Z-score method:
{'Administration': [182645.56, 85047.44, 51283.14, 65947.93, 82982.09, 84710.77, 51743.15, 241926.31, 321652.14, 270939.86], 'Profit': [413956.48, 333962.19, 476485.43]}


In [18]:
# Collect, per numeric column, the values lying outside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. Columns with no such value are omitted.
iqr_outliers = {}
for name, series in numeric_columns.items():
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1

    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    outside = (series < low_fence) | (series > high_fence)
    flagged = series[outside].tolist()
    if flagged:
        iqr_outliers[name] = flagged

print("Outliers detected using IQR method:")
print(iqr_outliers)

Outliers detected using IQR method:
{'Administration': [151377.59, 91391.77, 148718.95, 91790.61, 156547.42, 153514.11, 153773.43, 182645.56, 153032.06, 152701.92, 157693.92, 85047.44, 51283.14, 65947.93, 82982.09, 84710.77, 96189.63, 154806.14, 51743.15, 241926.31, 321652.14, 270939.86], 'Profit': [413956.48, 333962.19, 476485.43]}


In [19]:
# Count the missing entries in every column of df.
null_counts = df.isna().sum()
print("Null values in each column:", null_counts, sep="\n")

Null values in each column:
R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64


In [26]:
# Mean-impute missing values in each numeric column of df.
for column in numeric_columns.columns:
    if df[column].isnull().any(): # Check if there are any null values to impute
        mean_value = df[column].mean()
        # Assign the filled Series back to the column. Calling
        # fillna(..., inplace=True) on df[column] operates on an intermediate
        # object: pandas 2.x warns about it, and with Copy-on-Write enabled it
        # leaves df unchanged.
        df[column] = df[column].fillna(mean_value)
        print(f"Null values in column '{column}' filled with mean: {mean_value}")
    else:
        print(f"No null values found in column '{column}'. Skipping imputation.")


No null values found in column 'R&D Spend'. Skipping imputation.
No null values found in column 'Administration'. Skipping imputation.
No null values found in column 'Marketing Spend'. Skipping imputation.
No null values found in column 'Profit'. Skipping imputation.


In [28]:
# Show the current column index, then the dtype of 'State' ahead of encoding it.
print("Columns in DataFrame:", df.columns)
state_dtype = df['State'].dtype
print("Data type of 'State' column:", state_dtype)

Columns in DataFrame: Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')
Data type of 'State' column: object


In [29]:
# One-hot encode 'State', keeping an indicator column for every category
# (drop_first is left at its default of False).
# This rebinds df; 'State' is no longer a column afterwards.
df = pd.get_dummies(df, columns=['State'])
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,False,False,True
1,162597.7,151377.59,443898.53,191792.06,True,False,False
2,153441.51,101145.55,407934.54,191050.39,False,True,False
3,144372.41,118671.85,383199.62,182901.99,False,False,True
4,142107.34,91391.77,366168.42,166187.94,False,True,False


In [30]:
# Preview the first rows of df again; this cell does not modify df.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,False,False,True
1,162597.7,151377.59,443898.53,191792.06,True,False,False
2,153441.51,101145.55,407934.54,191050.39,False,True,False
3,144372.41,118671.85,383199.62,182901.99,False,False,True
4,142107.34,91391.77,366168.42,166187.94,False,True,False


In [31]:
import numpy as np

# Apply log(1 + x) to the spend columns and to the Profit target in one
# vectorised assignment. Profit is transformed too, so anything fitted on it
# afterwards works on the log scale.
columns_to_transform = ['R&D Spend', 'Administration', 'Marketing Spend', 'Profit']
df[columns_to_transform] = np.log1p(df[columns_to_transform])

print("DataFrame after logarithmic transformation (first 5 rows):")
df.head()

DataFrame after logarithmic transformation (first 5 rows):


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,12.015821,11.826997,13.064279,12.166619,False,False,True
1,11.99904,11.927539,13.003354,12.164172,True,False,False
2,11.941081,11.524326,12.918864,12.160298,False,True,False
3,11.880158,11.684126,12.856314,12.116711,False,False,True
4,11.864345,11.422922,12.810851,12.020881,False,True,False


In [33]:
# Separate the 'Profit' target (y) from the remaining columns (X).
target_name = 'Profit'
y = df[target_name]
X = df.drop(columns=[target_name])

print("Features (X) head:", X.head(), sep="\n")
print("\nTarget variable (y) head:", y.head(), sep="\n")

Features (X) head:
   R&D Spend  Administration  Marketing Spend  State_California  \
0  12.015821       11.826997        13.064279             False   
1  11.999040       11.927539        13.003354              True   
2  11.941081       11.524326        12.918864             False   
3  11.880158       11.684126        12.856314             False   
4  11.864345       11.422922        12.810851             False   

   State_Florida  State_New York  
0          False            True  
1          False           False  
2           True           False  
3          False            True  
4           True           False  

Target variable (y) head:
0    12.166619
1    12.164172
2    12.160298
3    12.116711
4    12.020881
Name: Profit, dtype: float64


In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for caption, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(caption, part.shape)

Shape of X_train: (800, 6)
Shape of X_test: (200, 6)
Shape of y_train: (800,)
Shape of y_test: (200,)


In [35]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
# Create the estimator and fit it on the training split in one step
# (fit() returns the fitted estimator itself).
linear_reg_model = LinearRegression().fit(X_train, y_train)

print("Linear Regression model initialized and trained successfully.")

Linear Regression model initialized and trained successfully.


In [36]:
from sklearn.metrics import r2_score, mean_squared_error

# Make predictions on the training data
# Score the fitted model on the training split.
# Both metrics compare y_train with the model's predictions for X_train.
y_train_pred = linear_reg_model.predict(X_train)
r2_train, mse_train = (
    metric(y_train, y_train_pred) for metric in (r2_score, mean_squared_error)
)

print(f"R-squared on training set: {r2_train:.4f}")
print(f"Mean Squared Error on training set: {mse_train:.4f}")

R-squared on training set: 0.8551
Mean Squared Error on training set: 0.0210


In [37]:
from sklearn.metrics import r2_score, mean_squared_error

# Make predictions on the test data
# Score the fitted model on the held-out test split.
# Both metrics compare y_test with the model's predictions for X_test.
y_test_pred = linear_reg_model.predict(X_test)
r2_test, mse_test = (
    metric(y_test, y_test_pred) for metric in (r2_score, mean_squared_error)
)

print(f"R-squared on test set: {r2_test:.4f}")
print(f"Mean Squared Error on test set: {mse_test:.4f}")

R-squared on test set: 0.9002
Mean Squared Error on test set: 0.0134


In [38]:
# Print the four metrics computed in the evaluation cells, followed by a fixed
# interpretation text. The interpretation sentences are static strings; they
# are not derived from the metric values.
summary_lines = [
    "\n--- Model Performance Summary ---",
    f"Training R-squared: {r2_train:.4f}",
    f"Training Mean Squared Error: {mse_train:.4f}",
    f"Test R-squared: {r2_test:.4f}",
    f"Test Mean Squared Error: {mse_test:.4f}",
]
interpretation_lines = [
    "\nInterpretation:",
    "The R-squared values indicate the proportion of variance in the dependent variable that is predictable from the independent variables. A higher R-squared suggests a better fit.",
    "The Mean Squared Error (MSE) measures the average of the squares of the errors or deviations. A lower MSE indicates a better model performance.",
    "Our model shows good performance on both the training and test sets, with slightly better performance on the test set for R-squared, and lower MSE on the test set, suggesting good generalization.",
]
print("\n".join(summary_lines + interpretation_lines))


--- Model Performance Summary ---
Training R-squared: 0.8551
Training Mean Squared Error: 0.0210
Test R-squared: 0.9002
Test Mean Squared Error: 0.0134

Interpretation:
The R-squared values indicate the proportion of variance in the dependent variable that is predictable from the independent variables. A higher R-squared suggests a better fit.
The Mean Squared Error (MSE) measures the average of the squares of the errors or deviations. A lower MSE indicates a better model performance.
Our model shows good performance on both the training and test sets, with slightly better performance on the test set for R-squared, and lower MSE on the test set, suggesting good generalization.


In [42]:
# pickle is used by the following cells to write and read the fitted model file.
import pickle

# Confirmation message only; it has no effect on later cells.
print("Pickle module imported.")

Pickle module imported.


In [40]:
# Serialize the fitted estimator to disk in binary pickle format.
model_path = 'linear_reg_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(linear_reg_model, model_file)

print("Model saved successfully as 'linear_reg_model.pkl'")

Model saved successfully as 'linear_reg_model.pkl'


In [41]:
import pandas as pd
import numpy as np
import pickle

def predict_profit(rd_spend, administration, marketing_spend, state, model=None):
    """Predict profit on the original (non-log) scale for a single company.

    The three spend values are log1p-transformed and the state is one-hot
    encoded into the indicator columns 'State_California', 'State_Florida'
    and 'State_New York'. The model's output is treated as log1p(profit) and
    mapped back with expm1.

    Args:
        rd_spend: R&D spend on the original scale.
        administration: Administration spend on the original scale.
        marketing_spend: Marketing spend on the original scale.
        state: One of 'California', 'Florida' or 'New York'.
        model: Optional object exposing ``predict(DataFrame)``. When omitted,
            the model is unpickled from 'linear_reg_model.pkl' in the current
            working directory.

    Returns:
        float: the predicted profit on the original scale.

    Raises:
        ValueError: if ``state`` is not one of the supported states. Without
            this check an unknown state would be encoded as all-False
            indicators and silently produce a prediction.
    """
    state_columns = ['State_California', 'State_Florida', 'State_New York']
    selected_state_column = f'State_{state}'
    if selected_state_column not in state_columns:
        supported = [name[len('State_'):] for name in state_columns]
        raise ValueError(f"Unknown state {state!r}; expected one of {supported}")

    if model is None:
        # NOTE: pickle.load can execute arbitrary code embedded in the file.
        # Only load a model file that this notebook itself produced.
        with open('linear_reg_model.pkl', 'rb') as file:
            model = pickle.load(file)

    # Same log1p transform for the numeric inputs as the notebook applies before fitting.
    features = {
        'R&D Spend': np.log1p(rd_spend),
        'Administration': np.log1p(administration),
        'Marketing Spend': np.log1p(marketing_spend),
    }
    # Exactly one state indicator is True; the others are False.
    for col in state_columns:
        features[col] = (col == selected_state_column)

    # Column order the model is given: numeric features first, then the state indicators.
    expected_columns = ['R&D Spend', 'Administration', 'Marketing Spend',
                        'State_California', 'State_Florida', 'State_New York']
    input_data = pd.DataFrame([features], columns=expected_columns)

    # The model predicts log1p(profit); invert to the original scale.
    log_profit_prediction = model.predict(input_data)[0]

    # Return a plain Python float rather than a numpy scalar.
    return float(np.expm1(log_profit_prediction))

print("The 'predict_profit' function has been defined.")

The 'predict_profit' function has been defined.


In [43]:
# gradio supplies the input/output components and the Interface used in the following cells.
import gradio as gr

# Confirmation message only; it has no effect on later cells.
print("Gradio library imported as gr.")

Gradio library imported as gr.


In [44]:
# Three numeric inputs (pre-filled with example values) plus a state dropdown.
# They are listed in the same order as predict_profit's positional parameters.
numeric_defaults = [
    ('R&D Spend', 160000),
    ('Administration', 120000),
    ('Marketing Spend', 400000),
]
input_components = [
    gr.Number(label=field_label, value=example_value)
    for field_label, example_value in numeric_defaults
]
input_components.append(
    gr.Dropdown(['New York', 'California', 'Florida'], label='State', value='New York')
)

output_component = gr.Number(label='Predicted Profit')

# Wire the prediction function to the components defined above.
interface = gr.Interface(
    title='Startup Profit Prediction Model',
    fn=predict_profit,
    inputs=input_components,
    outputs=output_component,
)

print("Gradio interface created.")

Gradio interface created.


In [45]:
# Start the Gradio app. With share=True the recorded run reported a public
# *.gradio.live URL, so the prediction endpoint is reachable from outside the
# notebook while the link is active.
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ba472120ff2883ee7f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


