In [None]:
#1 Simple Linear
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df=pd.read_csv('FuelConsumptionLine.csv')
df.head()
df.isna().sum()

corr_matrix=df.corr()
sns.heatmap(corr_matrix,cmap='coolwarm',annot=True)

df.columns.to_list()

X=df[['FUEL CONSUMPTION']]
Y=df['COEMISSIONS ']

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.7,random_state=47)

from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,Y_train)
y_pred=model.predict(X_test)

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
mae=mean_absolute_error(Y_test,y_pred)
mse=mean_squared_error(Y_test,y_pred)
r2=r2_score(Y_test,y_pred)
rmse=np.sqrt(mse)

print(f'MAE:{mae}\nRMSE:{rmse}\nR2:{r2}')

plt.scatter(X_test,Y_test,color='gray')
plt.plot(X_test,y_pred,color='red',linewidth=2)

f=float(input('Enter fuel consumption: '))
f=np.array(f).reshape(-1,1)
pred=model.predict(f)
print(f'Predicted Emissions:{pred}')

In [None]:
#2 Multiple Linear
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df=pd.read_csv('FuelConsumptionLine.csv')
df.head()
df.isna().sum()

corr_matrix=df.corr()
sns.heatmap(corr_matrix,cmap='coolwarm',annot=True)

df.columns.to_list()

X=df[['ENGINE SIZE','CYLINDERS','FUEL CONSUMPTION']]
Y=df['COEMISSIONS']

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.7,random_state=47)

from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,Y_train)
y_pred=model.predict(X_test)

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
mae=mean_absolute_error(Y_test,y_pred)
mse=mean_squared_error(Y_test,y_pred)
r2=r2_score(Y_test,y_pred)
rmse=np.sqrt(mse)

print(f'MAE:{mae}\nRMSE:{rmse}\nR2:{r2}')

e=float(input('Engine Size: '))
c=int(input('No. of cylinders: '))
f=float(input('Fuel Consumption: '))
data=[[e,c,f]]
data=np.array(data)
pred=model.predict(data)
print(f'Predicted Emission: {pred}')

In [None]:
#3 Logistic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df=pd.read_csv('Bank_Personal_Loan_Modelling.csv')
df.head(5)

df.isna().sum()

X=df.drop(columns='Personal Loan')
Y=df['Personal Loan']

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.7,random_state=47)

from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(X_train,Y_train)
y_pred=model.predict(X_test)

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print('Accuracy Score: ',accuracy_score(Y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(Y_test,y_pred))
print('Classification Report:\n',classification_report(Y_test,y_pred))

sns.scatterplot(x=X_test['Age'],y=X_test['Income'],hue=Y_test,palette={0:'lightgray',1:'green'},marker='o')

sns.scatterplot(x=X_test['Age'],y=X_test['Income'],hue=y_pred,palette={0:'lightgray',1:'green'},marker='o')

age=int(input('Enter Age:'))
exp=int(input('Enter Experience:'))
inc=int(input('Enter Income:'))
fam=int(input('Enter no. of family members:'))
cc=float(input('Enter CC Avg:'))
ed=int(input('Enter Education:'))
mor=int(input('Enter Mortgage:'))
sec=int(input('Securities Account 0/1: '))
cd=int(input('CD Account 0/1: '))
on=int(input('Online 0/1: '))
cr=int(input('Credit Card 0/1: '))
data=[[age,exp,inc,fam,cc,ed,mor,sec,cd,on,cr]]
data=np.array(data)
pred=model.predict(data)
if pred==1:
    print('Personal Loan Eligible')
if pred==0:
    print("Personal Loan Not Eligible")

In [None]:
#4 time series Python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import STL

tdata = pd.read_csv('AirPassengers.csv')

x=tdata['Month']
y=tdata['#Passengers']

plt.figure(figsize=(15,4), dpi=1000)
plt.plot(x, y, color='tab:red')
plt.gca().set(title='No. of Passengers', xlabel='Date', ylabel='Passenger Count')
plt.show()

# Multiplicative Decomposition
multiplicative_decomposition = seasonal_decompose(tdata['#Passengers'], model='multiplicative', period=30)

# Additive Decomposition
additive_decomposition = seasonal_decompose(tdata['#Passengers'], model='additive', period=30)

# Plot
plt.rcParams.update({'figure.figsize': (16,12)})
multiplicative_decomposition.plot().suptitle('Multiplicative Decomposition', fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

additive_decomposition.plot().suptitle('Additive Decomposition', fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

plt.show()

fig, ax1 = plt.subplots(figsize=(8, 3), dpi=100)
plot_acf(tdata['#Passengers'].tolist(), lags=50, ax=ax1)

fig, ax2 = plt.subplots(figsize=(8, 3), dpi=100)
plot_pacf(tdata['#Passengers'].tolist(), lags=50, ax=ax2)

plt.show()

pass_data = pd.DataFrame({
    'Month': pd.date_range(start='1949-01', periods=12*12, freq='M'),
    'Passengers': tdata['#Passengers'] * 20
})

pass_data['Year'] = pass_data['Month'].dt.year

plt.figure(figsize=(10, 6))

for year, group in pass_data.groupby('Year'):
    plt.plot(group['Month'].dt.month, group['Passengers'], label=str(year))

plt.title('Seasonal Passengers Plot ')
plt.xlabel('Month')
plt.ylabel('Passengers')
plt.grid(True)
plt.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.tight_layout()
plt.show()

tdata['Date'] = pd.to_datetime(tdata['Month'])
tdata.set_index('Date', inplace=True)

plt.figure(figsize=(10, 6))
plt.plot(tdata.index, tdata['#Passengers'], label='Actual Data', color='blue')
plt.title('Passengers Trend Plot')
plt.xlabel('Date')
plt.ylabel('Passengers')
plt.legend()
plt.grid(True)

rolling_mean = tdata['#Passengers'].rolling(window=12).mean()
plt.plot(tdata.index, rolling_mean, label='12-Month Rolling Mean', color='red')

plt.tight_layout()
plt.show()

stl = STL(tdata['#Passengers'])
result = stl.fit()

plt.figure(figsize=(10, 6))
# plt.plot(result.trend.index, result.trend, label='Trend', color='blue')
plt.plot(result.seasonal.index, result.seasonal, label='Cyclic', color='red')
plt.title('Cyclic Component Plot')
plt.xlabel('Date')
plt.ylabel('Magnitude')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#4 Time series in R
data(AirPassengers)
summary(AirPassengers)

plot(AirPassengers)
abline(reg=lm(AirPassengers~time(AirPassengers)))

cycle(AirPassengers)

plot(aggregate(AirPassengers,FUN=mean))

boxplot(AirPassengers~cycle(AirPassengers))

install.packages('forecast')
library(forecast)

passengers_ts <- ts(AirPassengers, frequency = 12)
autoplot(passengers_ts) +
xlab("Year") +
ylab("Passenger Count") +
ggtitle("AirPassengers Time Series Data")

ggseasonplot(passengers_ts) +
ggtitle("Seasonal Plot of AirPassengers Data")

ggsubseriesplot(passengers_ts) +
 ggtitle("Trend Plot of AirPassengers Data")

ggsubseriesplot(diff(log(passengers_ts))) +
ggtitle("Cyclic Plot of AirPassengers Data")

myts <- ts(AirPassengers, start=c(1949, 1), end=c(1960, 12), frequency=12)

# subset the time series (June 2014 to December 2014)
myts2 <- window(myts, start=c(1960, 6), end=c(1960, 12))

# plot series
plot(myts)

tsdata <- ts(AirPassengers, frequency = 12)

ddata <- decompose(tsdata, "multiplicative")

plot(ddata)

plot(ddata$trend)
plot(ddata$seasonal)
plot(ddata$random)

acf(diff(log(AirPassengers)))
pacf(diff(log(AirPassengers)))

In [None]:
#5 ARIMA python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
import statsmodels.tsa.api as smt

tdata = pd.read_csv('AirPassengers.csv')

tdata['Date'] = pd.to_datetime(tdata['Month'])
tdata = tdata.drop(columns = 'Month')
tdata = tdata.set_index('Date')
tdata = tdata.rename(columns = {'#Passengers':'Passengers'})
tdata.head()

def test_stationarity(timeseries):
    #Determing rolling statistics
    MA = timeseries.rolling(window=12).mean()
    MSTD = timeseries.rolling(window=12).std()

    #Plot rolling statistics:
    plt.figure(figsize=(15,5))
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(MA, color='red', label='Rolling Mean')
    std = plt.plot(MSTD, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    #Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)

def tsplot(y, lags=None, figsize=(12, 7), style='bmh'):
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    with plt.style.context(style):
        fig = plt.figure(figsize=figsize)
        layout = (2, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))

        y.plot(ax=ts_ax)
        p_value = sm.tsa.stattools.adfuller(y)[1]
        ts_ax.set_title('Time Series Analysis Plots\n Dickey-Fuller: p={0:.5f}'.format(p_value))
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
        plt.tight_layout()

plt.figure(figsize = (15,5))
tdata['Passengers'].plot()

dec = sm.tsa.seasonal_decompose(tdata['Passengers'],period = 12, model = 'multiplicative').plot()
plt.show()

test_stationarity(tdata['Passengers'])

tdata_diff = tdata.diff()
tdata_diff = tdata_diff.dropna()

dec = sm.tsa.seasonal_decompose(tdata_diff,period = 12).plot()
plt.show()

test_stationarity(tdata_diff)

tsplot(tdata['Passengers'])

tsplot(tdata_diff['Passengers'])

model = sm.tsa.arima.ARIMA(tdata['Passengers'],order = (2,1,2))
model_fit = model.fit()
print(model_fit.summary())

size = int(len(tdata) - 30)
train, test = tdata['Passengers'][0:size], tdata['Passengers'][size:len(tdata)]

print('\t ARIMA MODEL : In- Sample Forecasting \n')

history = [x for x in train]
predictions = []

for t in range(len(test)):

    model = sm.tsa.arima.ARIMA(history, order=(2,1,2))
    model_fit = model.fit()

    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(float(yhat))

    obs = test[t]
    history.append(obs)

    print('predicted = %f, expected = %f' % (yhat, obs))

predictions_series = pd.Series(predictions, index = test.index)
fig,ax = plt.subplots(nrows = 1,ncols = 1,figsize = (15,5))

plt.subplot(1,1,1)
plt.plot(tdata['Passengers'],label = 'Expected Values')
plt.plot(predictions_series,label = 'Predicted Values');
plt.legend(loc="upper left")
plt.show()

error = np.sqrt(mean_squared_error(test,predictions))
print('Test RMSE: %.4f' % error)


In [None]:
#5 ARIMA in R
data(AirPassengers)
summary(AirPassengers)

plot(AirPassengers)
abline(reg=lm(AirPassengers~time(AirPassengers)))

install.packages('forecast')
library(forecast)

mymodel <- auto.arima(AirPassengers)

mymodel

plot.ts(mymodel$residuals)

myforecast <- forecast(mymodel, level=c(95), h=10*12)

plot(myforecast)

Box.test(mymodel$resid, lag=5, type="Ljung-Box")

In [None]:
#6 Spam Filter
import pandas as pd

text=pd.read_csv('mail_data-Copy1.csv')
text.head(5)

text['Category'].value_counts(normalize=True).plot(kind='bar')

text['Category']=text['Category'].astype('category').cat.codes

X=text['Message']
Y=text['Category']

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=47)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
xv_train=tfidf.fit_transform(X_train)
xv_test=tfidf.transform(X_test)

from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(xv_train,Y_train)
y_pred=model.predict(xv_test)

from sklearn.metrics import accuracy_score
print('Accuracy: ',accuracy_score(Y_test,y_pred))

input1=input('Enter msg:')

new_msg={'text':[input1]}
new_df=pd.DataFrame(new_msg)

x_test_new=new_df['text']
xv_test_new=tfidf.transform(x_test_new)
pred=model.predict(xv_test_new)
if pred==0:
    print('message: ham')
if pred==1:
    print('message: spam')
    
#Enter msg:As a valued customer, I am pleased to inform you that on checking your Mob no. you have won $1200 bonus prize, please call on 123456788
#message: spam

In [None]:
# 7 Sentiment Analysis
import pandas as pd

text=pd.read_csv('mail_data-Copy1.csv')
text.head(5)

text['Sentiment'].value_counts(normalize=True).plot(kind='bar')

text['Sentiment']=text['Sentiment'].astype('category').cat.codes

X=text['Review']
Y=text['Sentiment']

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=47)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
xv_train=tfidf.fit_transform(X_train)
xv_test=tfidf.transform(X_test)

from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(xv_train,Y_train)
y_pred=model.predict(xv_test)

from sklearn.metrics import accuracy_score
print('Accuracy: ',accuracy_score(Y_test,y_pred))

input1=input('Enter review:')

new_msg={'text':[input1]}
new_df=pd.DataFrame(new_msg)

x_test_new=new_df['text']
xv_test_new=tfidf.transform(x_test_new)
pred=model.predict(xv_test_new)
if pred==0:
    print('Sentiment: Negative')
if pred==1:
    print('Sentiment: Positive')

In [None]:
# 8 Word Cloud

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

text=pd.read_csv('/content/drive/MyDrive/Youtube04-Eminem.csv')
text.head(5)

r=text['CONTENT'].str.cat(sep='')

from wordcloud import WordCloud
wc=WordCloud(collocations=False,background_color='white',width=2048,height=1080).generate(r)
plt.imshow(wc,interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# 10 DV in R

data(iris)
head(iris)

install.packages('ggplot2')

library(ggplot2)

#Scatter
ggplot(data=iris,aes(x=Sepal.Width, y=Sepal.Length , color = Species)) + geom_point() + theme_minimal() + labs(title = "Sepal Length vs. Sepal Width", x = "sepal_length", y = "sepal_width")

#scatter with trend line
ggplot(data=iris,aes(x=Sepal.Width, y=Sepal.Length,color=Species)) + geom_point() +geom_smooth() + theme_minimal()

#box plot
ggplot(data = iris, aes(x = Species, y = Sepal.Length, fill = Species)) + geom_boxplot() + labs(title = "Boxplot of Sepal Length by Species", x = "Species", y = "Sepal.Length")

#histogram
ggplot(data=iris, aes(x = Petal.Length, fill = Species)) +
geom_histogram(binwidth = 0.5, position = "identity", alpha = 0.5) +
labs(title = "Histogram of Petal Length by Species", x = "petal_length", y = "Frequency") +
facet_wrap(~ Species, ncol = 1)


ggplot(data=iris,aes(x=Sepal.Length,fill=Species)) +
  geom_histogram() +
    theme_minimal() +
    facet_wrap(~Species)

#violin
ggplot(data=iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
geom_violin() +
labs(title = "Violin plot of Sepal Length by Species", x = "species", y = "sepal_length")

#density plot
ggplot(data=iris, aes(x = Petal.Length, fill = Species)) +
geom_density(alpha = 0.5) +
labs(title = "Density plot of Petal Length by Species", x = "petal_length", y = "Density") +
facet_wrap(~ Species, ncol = 1)

ggplot(data=iris,aes(x=Sepal.Length,color=Species)) +
  geom_density() +
    theme_minimal()

#contour
ggplot(data=iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species)) +
  geom_density2d()+
  theme_minimal()



In [1]:
# EDA + DV in Python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data = sns.load_dataset('penguins')
data.head(5)

data.columns.to_list()

data.info()

data.describe()

data.shape

data.isna()

data.isna().sum()

data1=data.dropna()

data1.isna().sum()

comat=data.corr()
sns.heatmap(comat,cmap='coolwarm',annot=True)

sns.scatterplot(x=data['bill_length_mm'],y=data['flipper_length_mm'],hue=data['species'],marker='o')

sns.histplot(x=data['body_mass_g'],bins=20,kde=True)

sns.barplot(x=data['species'],y=(data['bill_depth_mm']))

sns.countplot(x=data['island'],hue=data['sex'])

sns.boxplot(x=data['species'],y=data['bill_length_mm'])

sns.violinplot(x=data['species'],y=data['bill_length_mm'])

sns.lineplot(x=data['flipper_length_mm'],y=data['body_mass_g'])

sns.pairplot(data, hue='sex')

#####################################################

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df2, x='bill_length_mm', y='flipper_length_mm', hue='species', alpha=0.8)
plt.title('Scatter Plot : Bill Length vs. Flipper Length')
plt.xlabel('Bill Length (mm)')
plt.ylabel('Flipper Length (mm)')
plt.grid(True)
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(data=df2, x='species', y='flipper_length_mm')
plt.title('Box : Flipper Length by Species')
plt.xlabel('Species')
plt.ylabel('Flipper Length (mm)')
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
sns.histplot(data=df2, x='body_mass_g', bins=20, kde=True)
plt.title('Histogram : Distribution of Body Mass')
plt.xlabel('Body Mass (g)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

plt.figure(figsize=(8, 6))
sns.countplot(data=df2, x='species')
plt.title('Bar :Count of Penguins Species')
plt.xlabel('Species')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
sns.violinplot(data=df2, x='species', y='bill_depth_mm')
plt.title('Violin: Bill Depth by Species')
plt.xlabel('Species')
plt.ylabel('Bill Depth (mm)')
plt.tight_layout()
plt.show()

correlation_matrix = df2.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Heatmap: Correlation Matrix of Numerical Variables')
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
sns.countplot(data=df2, x='island', hue='sex')
plt.title('Count of Male and Female Penguins per Island')
plt.xlabel('Island')
plt.ylabel('Count')
plt.legend(title='Sex')
plt.tight_layout()
plt.show()


NameError: name 'data' is not defined