### Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

### Reading the values from the dataset

In [2]:
data = pd.read_csv('instagram_reach.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   S.No               100 non-null    int64 
 2   USERNAME           100 non-null    object
 3   Caption            94 non-null     object
 4   Followers          100 non-null    int64 
 5   Hashtags           100 non-null    object
 6   Time since posted  100 non-null    object
 7   Likes              100 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 6.4+ KB


### Using .strip( ) to remove the string 'hours' from the 'Time since posted' column

In [4]:
time_since_posted = data['Time since posted']
for i in range(0, len(time_since_posted)):
    time_since_posted[i] = int(time_since_posted[i].strip('hours'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  time_since_posted[i] = int(time_since_posted[i].strip('hours'))


In [5]:
# using drop() to drop the following columns
data = data.drop(['Unnamed: 0', 'S.No', 'USERNAME', 'Caption', 'Hashtags'], axis = 1)
data

Unnamed: 0,Followers,Time since posted,Likes
0,1600,11,139
1,880,2,23
2,255,2,25
3,340,3,49
4,304,3,30
...,...,...,...
95,614,3,31
96,450,3,42
97,182,3,10
98,2039,3,222


In [6]:
df = data.values
X = df[:,:-1]
y = df[:,-1]

In [7]:
# using train_test_split to split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [8]:
scaled_features = StandardScaler().fit(X_train, X_test)

In [9]:
# using linear regression algorithm to predict
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Using the mean_squared_error( ) to find the error difference

In [10]:
mse = mean_squared_error(y_pred, y_test)
print(mse)

541.0121910944123
