In [1]:
import pandas as pd
import numpy as np

# Define the data with numeric labels for sentiment
data = {
    "Social Media Platform": ["Twitter", "Facebook", "Instagram", "Twitter", "Facebook",
                              "Instagram", "Twitter", "Facebook", "Instagram", "Twitter"],
    "Review": ["Love the new update!", "Too many ads now", "Great for sharing photos",
               "Newsfeed algorithm is biased", "Privacy concerns with latest update",
               "Amazing filters!", "Too much spam", "Easy to connect with friends",
               "Stories feature is fantastic", "Customer support lacking"],
    "age": [21, 19, np.nan, 17, 24, np.nan, 30, 19, 16, 31],
    "Sentiment": [1, 0, 1, 0, 0, 1, 0, 1, 1, 0]  # Numeric labels: 1 for Positive, 0 for Negative
}

# Create a DataFrame
df = pd.DataFrame(data)

df

Unnamed: 0,Social Media Platform,Review,age,Sentiment
0,Twitter,Love the new update!,21.0,1
1,Facebook,Too many ads now,19.0,0
2,Instagram,Great for sharing photos,,1
3,Twitter,Newsfeed algorithm is biased,17.0,0
4,Facebook,Privacy concerns with latest update,24.0,0
5,Instagram,Amazing filters!,,1
6,Twitter,Too much spam,30.0,0
7,Facebook,Easy to connect with friends,19.0,1
8,Instagram,Stories feature is fantastic,16.0,1
9,Twitter,Customer support lacking,31.0,0


In [2]:
df.shape

(10, 4)

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


from sklearn.feature_extraction.text import CountVectorizer

In [4]:
column_transformer = ColumnTransformer(
    transformers=[
        ('platform_ohe', OneHotEncoder(), [0]),  # Assuming 'Social Media Platform' is the first column
        ('review_bow', CountVectorizer(), 1),    # Assuming 'Review' is the second column
        ('age_impute', SimpleImputer(), [2])     # Assuming 'age' is the third column
    ],
    remainder='drop'
)


In [5]:
pd.DataFrame(column_transformer.fit_transform(df).toarray(),columns=column_transformer.get_feature_names_out())

Unnamed: 0,platform_ohe__Social Media Platform_Facebook,platform_ohe__Social Media Platform_Instagram,platform_ohe__Social Media Platform_Twitter,review_bow__ads,review_bow__algorithm,review_bow__amazing,review_bow__biased,review_bow__concerns,review_bow__connect,review_bow__customer,...,review_bow__sharing,review_bow__spam,review_bow__stories,review_bow__support,review_bow__the,review_bow__to,review_bow__too,review_bow__update,review_bow__with,age_impute__age
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,21.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.125
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,24.0
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.125
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,30.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,19.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,31.0


In [6]:
column_transformer = ColumnTransformer(
    transformers=[
        ('platform_ohe',OneHotEncoder(),['Social Media Platform']),
        ('review_bow',CountVectorizer(),'Review'),
        ('age_impute',SimpleImputer(),['age'])],
    remainder='drop')

In [7]:
pd.DataFrame(column_transformer.fit_transform(df).toarray(),columns=column_transformer.get_feature_names_out())

Unnamed: 0,platform_ohe__Social Media Platform_Facebook,platform_ohe__Social Media Platform_Instagram,platform_ohe__Social Media Platform_Twitter,review_bow__ads,review_bow__algorithm,review_bow__amazing,review_bow__biased,review_bow__concerns,review_bow__connect,review_bow__customer,...,review_bow__sharing,review_bow__spam,review_bow__stories,review_bow__support,review_bow__the,review_bow__to,review_bow__too,review_bow__update,review_bow__with,age_impute__age
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,21.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.125
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,24.0
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.125
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,30.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,19.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,31.0


#### FeatureUnion

In [9]:
import pandas as pd
import numpy as np

# Generating a random dataset with 10 rows and 4 columns
np.random.seed(42)  # For reproducibility
data = np.random.randn(10, 4)

# Creating a DataFrame and naming the columns
df = pd.DataFrame(data, columns=['f1', 'f2', 'f3', 'y'])

df

Unnamed: 0,f1,f2,f3,y
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,-0.463418,-0.46573
3,0.241962,-1.91328,-1.724918,-0.562288
4,-1.012831,0.314247,-0.908024,-1.412304
5,1.465649,-0.225776,0.067528,-1.424748
6,-0.544383,0.110923,-1.150994,0.375698
7,-0.600639,-0.291694,-0.601707,1.852278
8,-0.013497,-1.057711,0.822545,-1.220844
9,0.208864,-1.95967,-1.328186,0.196861


In [20]:
from sklearn.pipeline import FeatureUnion, Pipeline

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler

In [21]:
featureunion = FeatureUnion([
    ('scaler',StandardScaler()),
    ('PCa',PCA(n_components=2))    
])

In [22]:
pd.DataFrame(featureunion.fit_transform(df.drop(columns=['y'])),columns=featureunion.get_feature_names_out())

Unnamed: 0,scaler__f1,scaler__f2,scaler__f3,PCa__pca0,PCa__pca1
0,0.815293,0.41836,0.947878,-1.025659,-0.425413
1,-0.282292,0.302777,1.873701,-1.772532,-0.358223
2,-0.635686,1.239158,-0.156427,-0.327888,1.038742
3,0.432718,-1.721587,-1.410206,1.911072,-0.68996
4,-1.451676,0.963905,-0.598312,0.193153,1.371662
5,2.270396,0.312856,0.371269,-0.51176,-0.891133
6,-0.74818,0.718778,-0.839795,0.48428,1.020731
7,-0.832663,0.233387,-0.29387,0.191723,0.583958
8,0.04908,-0.690119,1.121664,-0.726878,-0.811461
9,0.383011,-1.777515,-1.015903,1.584488,-0.838903


In [24]:
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('PCa',PCA(n_components=2))    
])

In [25]:
pd.DataFrame(pipeline.fit_transform(df.drop(columns=['y'])),columns=pipeline.get_feature_names_out())

Unnamed: 0,pca0,pca1
0,-0.259406,1.275005
1,-1.074645,1.255828
2,-1.227399,-0.450157
3,2.07922,-0.900035
4,-1.267134,-1.332941
5,0.780321,1.776439
6,-0.627837,-1.074993
7,-0.497471,-0.748148
8,0.143582,0.836302
9,1.950769,-0.637301


In [26]:
import pandas as pd
import numpy as np

In [27]:
# Define the data with numeric labels for sentiment
data = {
    "Social Media Platform": ["Twitter", "Facebook", "Instagram", "Twitter", "Facebook",
                              "Instagram", "Twitter", "Facebook", "Instagram", "Twitter"],
    "Review": ["Love the new update!", "Too many ads now", "Great for sharing photos",
               "Newsfeed algorithm is biased", "Privacy concerns with latest update",
               "Amazing filters!", "Too much spam", "Easy to connect with friends",
               "Stories feature is fantastic", "Customer support lacking"],
    "age": [21, 19, np.nan, 17, 24, np.nan, 30, 19, 16, 31],
    "Sentiment": [1, 0, 1, 0, 0, 1, 0, 1, 1, 0]  # Numeric labels: 1 for Positive, 0 for Negative
}

# Create a DataFrame
df = pd.DataFrame(data)

df

Unnamed: 0,Social Media Platform,Review,age,Sentiment
0,Twitter,Love the new update!,21.0,1
1,Facebook,Too many ads now,19.0,0
2,Instagram,Great for sharing photos,,1
3,Twitter,Newsfeed algorithm is biased,17.0,0
4,Facebook,Privacy concerns with latest update,24.0,0
5,Instagram,Amazing filters!,,1
6,Twitter,Too much spam,30.0,0
7,Facebook,Easy to connect with friends,19.0,1
8,Instagram,Stories feature is fantastic,16.0,1
9,Twitter,Customer support lacking,31.0,0


In [35]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [36]:
X.head()

Unnamed: 0,Social Media Platform,Review,age
0,Twitter,Love the new update!,21.0
1,Facebook,Too many ads now,19.0
2,Instagram,Great for sharing photos,
3,Twitter,Newsfeed algorithm is biased,17.0
4,Facebook,Privacy concerns with latest update,24.0


In [28]:
def count_words(reviews):
    return np.array([len(review.split()) for review in reviews]).reshape(-1,1)

In [29]:
from sklearn.preprocessing import FunctionTransformer

In [30]:
word_count_transformer = FunctionTransformer(count_words)

In [31]:
feature_union = FeatureUnion([
    ('word_count',word_count_transformer),
    ('bag_of_words',CountVectorizer())
])

In [32]:
column_transformer = ColumnTransformer(transformers=[
    ('age_imputer',SimpleImputer(strategy='mean'),['age']),
    ('platform_ohe',OneHotEncoder(),['Social Media Platform']),
    ('review_processing',feature_union,'Review')
],
                                       remainder='drop')

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectKBest, chi2

In [34]:
final_pipeline = Pipeline(steps=[
    ('col_transformer',column_transformer),
    ('scaler',MaxAbsScaler()),
    ('selector',SelectKBest(score_func=chi2,k=10)),
    ('classifier',LogisticRegression())
])

In [37]:
final_pipeline.fit(X,y)

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('cars.csv')

In [3]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
df.shape

(8128, 5)

In [5]:
df.isnull().sum()

brand            0
km_driven        0
fuel             0
owner            0
selling_price    0
dtype: int64

In [6]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

In [7]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X.shape,y.shape

((8128, 4), (8128,))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6502, 4), (1626, 4), (6502,), (1626,))

In [9]:
X_train

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560,Petrol,First Owner
6144,Honda,80000,Petrol,Second Owner
6381,Hyundai,150000,Diesel,Fourth & Above Owner
438,Maruti,120000,Diesel,Second Owner
5939,Maruti,25000,Petrol,First Owner
...,...,...,...,...
5226,Mahindra,120000,Diesel,First Owner
5390,Maruti,80000,Diesel,Second Owner
860,Hyundai,35000,Petrol,First Owner
7603,Maruti,27000,Diesel,First Owner


In [10]:
y

0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
8123    320000
8124    135000
8125    382000
8126    290000
8127    290000
Name: selling_price, Length: 8128, dtype: int64

In [11]:
np.count_nonzero(X_train['brand'].value_counts())

32

In [12]:
np.count_nonzero(X_test['brand'].value_counts())

25

In [13]:
X_train['owner'].value_counts()

First Owner             4240
Second Owner            1684
Third Owner              433
Fourth & Above Owner     141
Test Drive Car             4
Name: owner, dtype: int64

In [14]:
X_test['owner'].value_counts()

First Owner             1049
Second Owner             421
Third Owner              122
Fourth & Above Owner      33
Test Drive Car             1
Name: owner, dtype: int64

In [15]:
pd.set_option('display.max_columns', None)

In [16]:
column_transformer = ColumnTransformer(transformers=[
    ('ohe',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[0,2]),
    ('ordinal',OrdinalEncoder(categories=[['Test Drive Car','Fourth & Above Owner','Third Owner','Second Owner','First Owner']],
                              handle_unknown='use_encoded_value', unknown_value=10),[3]),
    ('scaling',StandardScaler(),[1])
],remainder='passthrough')

In [17]:
column_transformer.set_output(transform='pandas')

In [18]:
X_train_transformed = column_transformer.fit_transform(X_train)
X_train_transformed

Unnamed: 0,ohe__brand_Ambassador,ohe__brand_Ashok,ohe__brand_Audi,ohe__brand_BMW,ohe__brand_Chevrolet,ohe__brand_Daewoo,ohe__brand_Datsun,ohe__brand_Fiat,ohe__brand_Force,ohe__brand_Ford,ohe__brand_Honda,ohe__brand_Hyundai,ohe__brand_Isuzu,ohe__brand_Jaguar,ohe__brand_Jeep,ohe__brand_Kia,ohe__brand_Land,ohe__brand_Lexus,ohe__brand_MG,ohe__brand_Mahindra,ohe__brand_Maruti,ohe__brand_Mercedes-Benz,ohe__brand_Mitsubishi,ohe__brand_Nissan,ohe__brand_Opel,ohe__brand_Peugeot,ohe__brand_Renault,ohe__brand_Skoda,ohe__brand_Tata,ohe__brand_Toyota,ohe__brand_Volkswagen,ohe__brand_Volvo,ohe__fuel_CNG,ohe__fuel_Diesel,ohe__fuel_LPG,ohe__fuel_Petrol,ordinal__owner,scaling__km_driven
6518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,-1.156592
6144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.170496
6381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.370086
438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.855976
5939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,-0.772038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.855976
5390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.170496
860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,-0.600668
7603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,-0.737764


In [19]:
owner_df_train = pd.DataFrame(pd.concat([X_train['owner'],X_train_transformed['ordinal__owner']],axis=1))
owner_df_train

Unnamed: 0,owner,ordinal__owner
6518,First Owner,4.0
6144,Second Owner,3.0
6381,Fourth & Above Owner,1.0
438,Second Owner,3.0
5939,First Owner,4.0
...,...,...
5226,First Owner,4.0
5390,Second Owner,3.0
860,First Owner,4.0
7603,First Owner,4.0


In [20]:
owner_df_train.value_counts()

owner                 ordinal__owner
First Owner           4.0               4240
Second Owner          3.0               1684
Third Owner           2.0                433
Fourth & Above Owner  1.0                141
Test Drive Car        0.0                  4
dtype: int64

In [21]:
X_test_transformed = column_transformer.transform(X_test)
X_test_transformed

Unnamed: 0,ohe__brand_Ambassador,ohe__brand_Ashok,ohe__brand_Audi,ohe__brand_BMW,ohe__brand_Chevrolet,ohe__brand_Daewoo,ohe__brand_Datsun,ohe__brand_Fiat,ohe__brand_Force,ohe__brand_Ford,ohe__brand_Honda,ohe__brand_Hyundai,ohe__brand_Isuzu,ohe__brand_Jaguar,ohe__brand_Jeep,ohe__brand_Kia,ohe__brand_Land,ohe__brand_Lexus,ohe__brand_MG,ohe__brand_Mahindra,ohe__brand_Maruti,ohe__brand_Mercedes-Benz,ohe__brand_Mitsubishi,ohe__brand_Nissan,ohe__brand_Opel,ohe__brand_Peugeot,ohe__brand_Renault,ohe__brand_Skoda,ohe__brand_Tata,ohe__brand_Toyota,ohe__brand_Volkswagen,ohe__brand_Volvo,ohe__fuel_CNG,ohe__fuel_Diesel,ohe__fuel_LPG,ohe__fuel_Petrol,ordinal__owner,scaling__km_driven
1971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.684606
4664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,3.803144
5448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,-0.000874
3333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.855976
2316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,-0.018011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1149,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,-1.054799
5002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,-0.514983
6008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,-0.274329
2283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,-0.000874


In [22]:
owner_df_test = pd.DataFrame(pd.concat([X_test['owner'],X_test_transformed['ordinal__owner']],axis=1))
owner_df_test

Unnamed: 0,owner,ordinal__owner
1971,Third Owner,2.0
4664,First Owner,4.0
5448,First Owner,4.0
3333,Second Owner,3.0
2316,Second Owner,3.0
...,...,...
1149,First Owner,4.0
5002,First Owner,4.0
6008,First Owner,4.0
2283,First Owner,4.0


In [23]:
owner_df_test.value_counts()

owner                 ordinal__owner
First Owner           4.0               1049
Second Owner          3.0                421
Third Owner           2.0                122
Fourth & Above Owner  1.0                 33
Test Drive Car        0.0                  1
dtype: int64

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
sacling = StandardScaler()

In [26]:
model_transformer = LinearRegression()

In [27]:
pipeline = Pipeline(steps=[
                   ('preprocessing',column_transformer),
                    ('scaler',sacling),
                   ('model',model_transformer)])

In [28]:
pipeline.fit(X_train,y_train)

In [29]:
y_pred = pipeline.predict(X_test)
y_pred

array([251616., 106816., 584896., ..., 467456., 282592.,  75648.])

In [30]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

-1.084353312511999e+32

In [31]:
from sklearn.metrics import root_mean_squared_error,r2_score

In [32]:
rmse=root_mean_squared_error(y_test,y_pred)

In [33]:
rmse

395462.475959516

In [34]:
np.round(r2_score(y_test,y_pred))

1.0

In [35]:
import pandas as pd
import numpy as np

In [36]:
# Load the Titanic dataset
df = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')[['Survived','Pclass','Age','Fare']]


df.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [37]:
df.shape

(891, 4)

In [38]:
df.isnull().sum()

Survived      0
Pclass        0
Age         177
Fare          0
dtype: int64

In [40]:
X = df.drop(columns=['Fare'])
y = df['Fare']

In [43]:
X.head()

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0
2,1,3,26.0
3,1,1,35.0
4,0,3,35.0


In [41]:
X.shape, y.shape

((891, 3), (891,))

In [44]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [45]:
X_train.shape, X_test.shape

((712, 3), (179, 3))