## Auto MPG Data Set
- https://archive.ics.uci.edu/ml/datasets/Auto+MPG
- The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes.
- This is a regression problem where we aim to predict a continuous output value, i.e. fuel efficiency.

## Steps performed in this SAS notebook:
- Importing Raw Data Files
- Check Data Types of Variables
- Check for Missing Data
- Handle Missing Values
- Check for Duplicate Entries
- Check for Outliers
- Check for Normal Distribution of Variables
- Handle Outliers 
- Check Correlation between Variables
- Log Transformation
- Final Visualizations

## Importing Raw Data Files

In [25]:
* Point the AUTO library at the project folder;
libname auto '/folders/myfolders/Project';

* Read the CSV into AUTO.ORIGINAL, replacing any existing copy;
* Ignore the error reported during the import. It is because of the '?' values in the horsepower variable;
proc import
    datafile="~/Project/auto_mpg.csv"
    out=auto.original
    dbms=csv
    replace;
run;

* Preview the first five imported rows;
proc print data=auto.original(obs=5);
run;

Obs,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
1,18,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu
2,15,8,350,165,3693,11.5,70,1,buick skylark 320
3,18,8,318,150,3436,11.0,70,1,plymouth satellite
4,16,8,304,150,3433,12.0,70,1,amc rebel sst
5,17,8,302,140,3449,10.5,70,1,ford torino


## Check Data Types of Variables

In [2]:
* Show dataset attributes and variable types;
* VARNUM is the documented option that lists the variables in creation order;
proc contents data=auto.original varnum;
run;

0,1,2,3
Data Set Name,AUTO.ORIGINAL,Observations,398
Member Type,DATA,Variables,9
Engine,V9,Indexes,0
Created,11/24/2020 02:33:44,Observation Length,96
Last Modified,11/24/2020 02:33:44,Deleted Observations,0
Protection,,Compressed,NO
Data Set Type,,Sorted,NO
Label,,,
Data Representation,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64",,
Encoding,utf-8 Unicode (UTF-8),,

Engine/Host Dependent Information,Engine/Host Dependent Information.1
Data Set Page Size,65536
Number of Data Set Pages,1
First Data Page,1
Max Obs per Page,681
Obs in First Data Page,398
Number of Data Set Repairs,0
Filename,/folders/myfolders/Project/original.sas7bdat
Release Created,9.0401M6
Host Created,Linux
Inode Number,748

Variables in Creation Order,Variables in Creation Order,Variables in Creation Order,Variables in Creation Order,Variables in Creation Order,Variables in Creation Order
#,Variable,Type,Len,Format,Informat
1,mpg,Num,8,BEST12.,BEST32.
2,cylinders,Num,8,BEST12.,BEST32.
3,displacement,Num,8,BEST12.,BEST32.
4,horsepower,Num,8,BEST12.,BEST32.
5,weight,Num,8,BEST12.,BEST32.
6,acceleration,Num,8,BEST12.,BEST32.
7,model_year,Num,8,BEST12.,BEST32.
8,origin,Num,8,BEST12.,BEST32.
9,car_name,Char,25,$25.,$25.


## Check for Missing Data

In [3]:
* Count non-missing (N) and missing (NMISS) values for every numeric variable;
* The output shows 6 missing values in the horsepower variable;

proc means data=auto.original n nmiss;
    var _numeric_;
run;

Variable,N,N Miss
mpg cylinders displacement horsepower weight acceleration model_year origin,398 398 398 392 398 398 398 398,0 0 0 6 0 0 0 0


In [4]:
* Map blank character values to Missing and every other value to Non Missing;
proc format;
    value $car
        ' '   = 'Missing'
        other = 'Non Missing';
run;

* Tabulate all character variables through the format defined above;
* The output shows no missing values in the character variables;
proc freq data=auto.original;
    format _character_ $car.;
    tables _character_ / missing nocum;
run;

car_name,Frequency,Percent
Non Missing,398,100.0


In [5]:
* Print the row number of each observation for which horsepower is missing;
* horsepower is a numeric variable, so the '?' entries of the CSV are stored as missing values;
* MISSING() tests for that directly instead of comparing a number with the character literal '?';

data _null_;
    set auto.original;
    file print;
    if missing(horsepower) then put _n_= horsepower=;
run;

In [6]:
* Mean and median of the non-missing horsepower values;
proc means data=auto.original mean median;
    var horsepower;
run;

Analysis Variable : horsepower,Analysis Variable : horsepower
Mean,Median
104.4693878,93.5


## Handle Missing Values

In [7]:
* Replace missing horsepower values with the mean of the horsepower variable;
* The mean is computed from the data and rounded to 2 decimals (104.47 for this file)
  rather than being typed in by hand;
proc sql noprint;
    select round(mean(horsepower), 0.01)
        into :hp_mean trimmed
        from auto.original;
quit;

data auto.updated;
    set auto.original;
    * horsepower is numeric, so test for a missing value rather than the literal '?';
    if missing(horsepower) then horsepower = &hp_mean;
run;

* Re-check if there are any missing values now;
proc means data=auto.updated n nmiss;
    var horsepower;
run;

Analysis Variable : horsepower,Analysis Variable : horsepower
N,N Miss
398,0


## Check for Duplicate Entries

In [8]:
* Remove exact duplicate observations and read the number deleted from the log;
* NODUPRECS compares each record only with the previous one, so the data is sorted
  by all variables to guarantee that identical records end up next to each other;
* The original run reported 0 duplicates deleted, so all observations are unique;

proc sort data=auto.updated out=auto.temp3 noduprecs;
    by _all_;
run;

## Check for Outliers

In [9]:
* Trim 5 percent of the values from the top and from the bottom of each numeric
  variable (TRIM=0.05) and save the resulting trimmed means table;
* NOPRINT would stop ODS from creating the TrimmedMeans table, so the displayed
  output is suppressed with ODS EXCLUDE instead;

ods exclude all;
ods output TrimmedMeans=auto.trimmed (keep= Varname mean stdmean df);
proc univariate data = auto.updated trim=0.05 nextrobs=10;
run;
ods output close;
ods exclude none;

proc print data=auto.trimmed;
run;

Obs,VarName,Mean,StdMean,DF
1,acceleration,15.5198,0.1375,357
2,cylinders,5.405,0.0942,357
3,displacement,187.3,5.6646,357
4,horsepower,101.7,1.9251,357
5,model_year,76.0112,0.2061,357
6,mpg,23.2229,0.4094,357
7,origin,1.5251,0.0447,357
8,weight,2937.2,45.1982,357


In [10]:
* Reshape the data from wide to long, writing one row per numeric variable
  per observation with the variable name and its value;

data auto.temp;
    set auto.updated;
    length VarName $ 32;
    * The array is declared before col and Value exist, so it holds only the input numeric variables;
    array num_cols[*] _numeric_;
    do col = 1 to dim(num_cols);
        Varname = vname(num_cols[col]);
        Value = num_cols[col];
        output;
    end;
    keep Varname Value;
run;

proc print data=auto.temp(obs=10);
run;

Obs,VarName,Value
1,mpg,18
2,cylinders,8
3,displacement,307
4,horsepower,130
5,weight,3504
6,acceleration,12
7,model_year,70
8,origin,1
9,mpg,15
10,cylinders,8


In [11]:
* Flag values lying more than 3 standard deviations away from the trimmed mean;

* Both inputs must be ordered by varname before the merge;
proc sort data=auto.temp;
    by varname;
run;

proc sort data=auto.trimmed;
    by varname;
run;

data auto.outlier;
    merge auto.temp auto.trimmed;
    by varname;
    * Standard deviation derived from the standard error of the trimmed mean;
    std_dev = stdmean * sqrt(df + 1);
    length Reason $12.;
    low_cut  = mean - 3*std_dev;
    high_cut = mean + 3*std_dev;
    * Keep only the rows outside the two cut-offs;
    if value lt low_cut then reason = 'Low';
    else if value gt high_cut then reason = 'High';
    else delete;
    drop low_cut high_cut;
run;

* Print the outlier values and the reason;
proc print data=auto.outlier;
    var varname value reason;
run;


Obs,VarName,Value,Reason
1,acceleration,23.5,High
2,acceleration,24.8,High
3,acceleration,23.7,High
4,acceleration,24.6,High
5,horsepower,220.0,High
6,horsepower,215.0,High
7,horsepower,225.0,High
8,horsepower,225.0,High
9,horsepower,215.0,High
10,horsepower,215.0,High


In [12]:
* Check outliers using the box plot method: acceleration, with outlying points labelled;

proc sgplot data=auto.updated;
    vbox acceleration / datalabel;
run;

In [13]:
* Box plot of horsepower with outlying points labelled;
proc sgplot data=auto.updated;
    vbox horsepower / datalabel;
run;

In [14]:
* Box plot of mpg with outlying points labelled;
proc sgplot data=auto.updated;
    vbox mpg / datalabel;
run;

In [15]:
* Box plot of displacement with outlying points labelled;
proc sgplot data=auto.updated;
    vbox displacement / datalabel;
run;

In [16]:
* Box plot of weight with outlying points labelled;
proc sgplot data=auto.updated;
    vbox weight / datalabel;
run;

In [17]:
* Detect outliers via the IQR method;

* Store Q1, Q3 and the interquartile range of each variable in the one-row dataset Tmpo;
proc means data=auto.updated noprint;
    var acceleration horsepower mpg;
    output out=Tmpo (drop=_type_ _freq_)
        Q1=
        Q3=
        QRange= / autoname;
run;

data _null_;
    file print;
    set auto.updated(keep=acceleration horsepower mpg);
    * Read the single row of statistics once and its values are retained for all rows;
    if _n_ = 1 then set Tmpo;
    * Each variable is tested on its own so that a row already reported for one
      variable is still reported for the others;
    if (acceleration le acceleration_Q1 - 1.5*acceleration_QRange and not missing(acceleration))
        or acceleration ge acceleration_Q3 + 1.5*acceleration_QRange then
        put "Possible Outlier for acceleration is " acceleration;
    if (horsepower le horsepower_Q1 - 1.5*horsepower_QRange and not missing(horsepower))
        or horsepower ge horsepower_Q3 + 1.5*horsepower_QRange then
        put "Possible Outlier for horsepower is " horsepower;
    if (mpg le mpg_Q1 - 1.5*mpg_QRange and not missing(mpg))
        or mpg ge mpg_Q3 + 1.5*mpg_QRange then
        put "Possible Outlier for mpg is " mpg;
run;

## Handle Outliers

In [18]:
* Only 18 observations have significant outlier values among the total of 398 observations;
* So the outlier rows can be deleted without much impact on the overall dataset;

* Q1, Q3 and interquartile range of each variable, written to the one-row dataset Tmpo;
proc means data=auto.updated noprint;
    var acceleration horsepower mpg;
    output out=Tmpo (drop=_type_ _freq_)
        Q1=
        Q3=
        QRange= / autoname;
run;

data auto.outlier_removed;
    set auto.updated;
    if _n_ = 1 then set Tmpo;
    * Drop the row when any of the three variables lies at or beyond 1.5 IQR from its quartiles;
    if (acceleration le acceleration_Q1 - 1.5*acceleration_QRange and not missing(acceleration))
        or (acceleration ge acceleration_Q3 + 1.5*acceleration_QRange)
        or (horsepower le horsepower_Q1 - 1.5*horsepower_QRange and not missing(horsepower))
        or (horsepower ge horsepower_Q3 + 1.5*horsepower_QRange)
        or (mpg le mpg_Q1 - 1.5*mpg_QRange and not missing(mpg))
        or (mpg ge mpg_Q3 + 1.5*mpg_QRange)
        then delete;

    * Keep the original columns only, leaving out the quartile statistics;
    keep mpg cylinders displacement horsepower weight acceleration model_year origin car_name;
run;

* Summary statistics of the cleaned data;
proc means data=auto.outlier_removed;
run;

Variable,N,Mean,Std Dev,Minimum,Maximum
mpg cylinders displacement horsepower weight acceleration model_year origin,380 380 380 380 380 380 380 380,23.7015789 5.3868421 187.4460526 101.4495263 2933.17 15.6384211 76.1526316 1.5842105,7.5097402 1.6661572 98.1339836 33.0483558 822.7597283 2.4885697 3.6172948 0.8094189,9.0000000 3.0000000 68.0000000 46.0000000 1613.00 9.5000000 70.0000000 1.0000000,44.6000000 8.0000000 429.0000000 198.0000000 5140.00 22.2000000 82.0000000 3.0000000


## Check for Normal Distribution of Variables

In [19]:
* Check the distribution of the continuous variables with probability (P-P) plots,
  Q-Q plots and histograms overlaid with normal and kernel density curves;

proc univariate data=auto.outlier_removed;
    var mpg displacement horsepower weight acceleration;
    ppplot;
    qqplot;
    histogram / normal kernel;
run;

Moments,Moments.1,Moments.2,Moments.3
N,380.0,Sum Weights,380.0
Mean,23.7015789,Sum Observations,9006.6
Std Deviation,7.50974021,Variance,56.396198
Skewness,0.3901323,Kurtosis,-0.6613074
Uncorrected SS,234844.8,Corrected SS,21374.1591
Coeff Variation,31.6845567,Std Error Mean,0.38524154

Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures
Location,Location.1,Variability,Variability.1
Mean,23.70158,Std Deviation,7.50974
Median,23.0,Variance,56.3962
Mode,13.0,Range,35.6
,,Interquartile Range,11.0

Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0
Test,Statistic,Statistic.1,p Value,p Value.1
Student's t,t,61.52394,Pr > |t|,<.0001
Sign,M,190.0,Pr >= |M|,<.0001
Signed Rank,S,36195.0,Pr >= |S|,<.0001

Quantiles (Definition 5),Quantiles (Definition 5)
Level,Quantile
100% Max,44.6
99%,41.5
95%,37.0
90%,34.25
75% Q3,29.0
50% Median,23.0
25% Q1,18.0
10%,14.0
5%,13.0
1%,12.0

Extreme Observations,Extreme Observations,Extreme Observations,Extreme Observations
Lowest,Lowest,Highest,Highest
Value,Obs,Value,Obs
9,20,40.9,314
11,111,41.5,295
11,91,43.1,231
12,94,44.3,310
12,92,44.6,313

Parameters for Normal Distribution,Parameters for Normal Distribution,Parameters for Normal Distribution
Parameter,Symbol,Estimate
Mean,Mu,23.70158
Std Dev,Sigma,7.50974

Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution
Test,Statistic,Statistic.1,p Value,p Value.1
Kolmogorov-Smirnov,D,0.07818917,Pr > D,<0.010
Cramer-von Mises,W-Sq,0.48872868,Pr > W-Sq,<0.005
Anderson-Darling,A-Sq,3.20984193,Pr > A-Sq,<0.005

Quantiles for Normal Distribution,Quantiles for Normal Distribution,Quantiles for Normal Distribution
Percent,Quantile,Quantile
Percent,Observed,Estimated
1.0,12.0,6.23131
5.0,13.0,11.34916
10.0,14.0,14.07746
25.0,18.0,18.63634
50.0,23.0,23.70158
75.0,29.0,28.76682
90.0,34.25,33.3257
95.0,37.0,36.054
99.0,41.5,41.17185

Moments,Moments.1,Moments.2,Moments.3
N,380.0,Sum Weights,380.0
Mean,187.446053,Sum Observations,71229.5
Std Deviation,98.1339836,Variance,9630.27874
Skewness,0.70188034,Kurtosis,-0.8299651
Uncorrected SS,17001564.3,Corrected SS,3649875.64
Coeff Variation,52.3531876,Std Error Mean,5.03416714

Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures
Location,Location.1,Variability,Variability.1
Mean,187.4461,Std Deviation,98.13398
Median,144.5,Variance,9630.0
Mode,97.0,Range,361.0
,,Interquartile Range,155.5

Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0
Test,Statistic,Statistic.1,p Value,p Value.1
Student's t,t,37.23477,Pr > |t|,<.0001
Sign,M,190.0,Pr >= |M|,<.0001
Signed Rank,S,36195.0,Pr >= |S|,<.0001

Quantiles (Definition 5),Quantiles (Definition 5)
Level,Quantile
100% Max,429.0
99%,400.0
95%,355.5
90%,350.0
75% Q3,258.0
50% Median,144.5
25% Q1,102.5
10%,90.0
5%,85.0
1%,70.0

Extreme Observations,Extreme Observations,Extreme Observations,Extreme Observations
Lowest,Lowest,Highest,Highest
Value,Obs,Value,Obs
68,104,400,143
70,318,400,216
70,99,400,218
70,61,429,6
71,118,429,80

Parameters for Normal Distribution,Parameters for Normal Distribution,Parameters for Normal Distribution
Parameter,Symbol,Estimate
Mean,Mu,187.4461
Std Dev,Sigma,98.13398

Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution
Test,Statistic,Statistic.1,p Value,p Value.1
Kolmogorov-Smirnov,D,0.1816688,Pr > D,<0.010
Cramer-von Mises,W-Sq,2.9949971,Pr > W-Sq,<0.005
Anderson-Darling,A-Sq,17.4438142,Pr > A-Sq,<0.005

Quantiles for Normal Distribution,Quantiles for Normal Distribution,Quantiles for Normal Distribution
Percent,Quantile,Quantile
Percent,Observed,Estimated
1.0,70.0,-40.8477
5.0,85.0,26.03
10.0,90.0,61.6823
25.0,102.5,121.2557
50.0,144.5,187.4461
75.0,258.0,253.6364
90.0,350.0,313.2098
95.0,355.5,348.8621
99.0,400.0,415.7398

Moments,Moments.1,Moments.2,Moments.3
N,380.0,Sum Weights,380.0
Mean,101.449526,Sum Observations,38550.82
Std Deviation,33.0483558,Variance,1092.19382
Skewness,0.85653714,Kurtosis,-0.0086479
Uncorrected SS,4324903.89,Corrected SS,413941.457
Coeff Variation,32.5761558,Std Error Mean,1.69534488

Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures
Location,Location.1,Variability,Variability.1
Mean,101.4495,Std Deviation,33.04836
Median,92.5,Variance,1092.0
Mode,150.0,Range,152.0
,,Interquartile Range,39.0

Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0
Test,Statistic,Statistic.1,p Value,p Value.1
Student's t,t,59.84005,Pr > |t|,<.0001
Sign,M,190.0,Pr >= |M|,<.0001
Signed Rank,S,36195.0,Pr >= |S|,<.0001

Quantiles (Definition 5),Quantiles (Definition 5)
Level,Quantile
100% Max,198.0
99%,190.0
95%,170.0
90%,150.0
75% Q3,115.0
50% Median,92.5
25% Q1,76.0
10%,67.0
5%,62.0
1%,48.0

Extreme Observations,Extreme Observations,Extreme Observations,Extreme Observations
Lowest,Lowest,Highest,Highest
Value,Obs,Value,Obs
46,90,190,60
46,14,190,218
48,310,193,20
48,231,198,6
49,104,198,80

Parameters for Normal Distribution,Parameters for Normal Distribution,Parameters for Normal Distribution
Parameter,Symbol,Estimate
Mean,Mu,101.4495
Std Dev,Sigma,33.04836

Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution
Test,Statistic,Statistic.1,p Value,p Value.1
Kolmogorov-Smirnov,D,0.1411765,Pr > D,<0.010
Cramer-von Mises,W-Sq,1.8438264,Pr > W-Sq,<0.005
Anderson-Darling,A-Sq,10.3894715,Pr > A-Sq,<0.005

Quantiles for Normal Distribution,Quantiles for Normal Distribution,Quantiles for Normal Distribution
Percent,Quantile,Quantile
Percent,Observed,Estimated
1.0,48.0,24.5676
5.0,62.0,47.0898
10.0,67.0,59.0964
25.0,76.0,79.1587
50.0,92.5,101.4495
75.0,115.0,123.7403
90.0,150.0,143.8027
95.0,170.0,155.8092
99.0,190.0,178.3315

Moments,Moments.1,Moments.2,Moments.3
N,380.0,Sum Weights,380.0
Mean,2933.16842,Sum Observations,1114604.0
Std Deviation,822.759728,Variance,676933.571
Skewness,0.56539907,Kurtosis,-0.6778769
Uncorrected SS,3525879078.0,Corrected SS,256557823.0
Coeff Variation,28.0502041,Std Error Mean,42.2066835

Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures
Location,Location.1,Variability,Variability.1
Mean,2933.168,Std Deviation,822.75973
Median,2764.5,Variance,676934.0
Mode,1985.0,Range,3527.0
,,Interquartile Range,1303.0

Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0
Test,Statistic,Statistic.1,p Value,p Value.1
Student's t,t,69.49535,Pr > |t|,<.0001
Sign,M,190.0,Pr >= |M|,<.0001
Signed Rank,S,36195.0,Pr >= |S|,<.0001

Quantiles (Definition 5),Quantiles (Definition 5)
Level,Quantile
100% Max,5140.0
99%,4952.0
95%,4448.0
90%,4177.5
75% Q3,3522.5
50% Median,2764.5
25% Q1,2219.5
10%,1985.0
5%,1895.0
1%,1760.0

Extreme Observations,Extreme Observations,Extreme Observations,Extreme Observations
Lowest,Lowest,Highest,Highest
Value,Obs,Value,Obs
1613,46,4906,92
1649,131,4952,80
1755,327,4955,34
1760,329,4997,91
1773,45,5140,36

Parameters for Normal Distribution,Parameters for Normal Distribution,Parameters for Normal Distribution
Parameter,Symbol,Estimate
Mean,Mu,2933.168
Std Dev,Sigma,822.7597

Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution
Test,Statistic,Statistic.1,p Value,p Value.1
Kolmogorov-Smirnov,D,0.09099272,Pr > D,<0.010
Cramer-von Mises,W-Sq,1.04159868,Pr > W-Sq,<0.005
Anderson-Darling,A-Sq,6.650986,Pr > A-Sq,<0.005

Quantiles for Normal Distribution,Quantiles for Normal Distribution,Quantiles for Normal Distribution
Percent,Quantile,Quantile
Percent,Observed,Estimated
1.0,1760.0,1019.14
5.0,1895.0,1579.85
10.0,1985.0,1878.76
25.0,2219.5,2378.23
50.0,2764.5,2933.17
75.0,3522.5,3488.11
90.0,4177.5,3987.58
95.0,4448.0,4286.49
99.0,4952.0,4847.19

Moments,Moments.1,Moments.2,Moments.3
N,380.0,Sum Weights,380.0
Mean,15.6384211,Sum Observations,5942.6
Std Deviation,2.48856968,Variance,6.19297903
Skewness,0.26359188,Kurtosis,-0.2052476
Uncorrected SS,95280.02,Corrected SS,2347.13905
Coeff Variation,15.9131773,Std Error Mean,0.12766093

Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures,Basic Statistical Measures
Location,Location.1,Variability,Variability.1
Mean,15.63842,Std Deviation,2.48857
Median,15.5,Variance,6.19298
Mode,14.5,Range,12.7
,,Interquartile Range,3.15

Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0,Tests for Location: Mu0=0
Test,Statistic,Statistic.1,p Value,p Value.1
Student's t,t,122.4997,Pr > |t|,<.0001
Sign,M,190.0,Pr >= |M|,<.0001
Signed Rank,S,36195.0,Pr >= |S|,<.0001

Quantiles (Definition 5),Quantiles (Definition 5)
Level,Quantile
100% Max,22.2
99%,21.9
95%,20.0
90%,19.0
75% Q3,17.15
50% Median,15.5
25% Q1,14.0
10%,12.5
5%,11.5
1%,10.5

Extreme Observations,Extreme Observations,Extreme Observations,Extreme Observations
Lowest,Lowest,Highest,Highest
Value,Obs,Value,Obs
9.5,8,21.8,312
10.0,7,21.9,196
10.0,6,22.1,183
10.5,5,22.2,182
11.0,111,22.2,286

Parameters for Normal Distribution,Parameters for Normal Distribution,Parameters for Normal Distribution
Parameter,Symbol,Estimate
Mean,Mu,15.63842
Std Dev,Sigma,2.48857

Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution,Goodness-of-Fit Tests for Normal Distribution
Test,Statistic,Statistic.1,p Value,p Value.1
Kolmogorov-Smirnov,D,0.05112619,Pr > D,0.017
Cramer-von Mises,W-Sq,0.13192489,Pr > W-Sq,0.043
Anderson-Darling,A-Sq,0.82490449,Pr > A-Sq,0.035

Quantiles for Normal Distribution,Quantiles for Normal Distribution,Quantiles for Normal Distribution
Percent,Quantile,Quantile
Percent,Observed,Estimated
1.0,10.5,9.84914
5.0,11.5,11.54509
10.0,12.5,12.44919
25.0,14.0,13.95991
50.0,15.5,15.63842
75.0,17.15,17.31694
90.0,19.0,18.82765
95.0,20.0,19.73175
99.0,21.9,21.4277


## Check Correlation between Variables

In [20]:
* Pearson correlation captures linear relationships and Spearman rank correlation
  captures monotonic, possibly non-linear, relationships;
* A scatter plot matrix of all variables with histograms on the diagonal is also requested;

proc corr data=auto.outlier_removed
          pearson spearman
          plots=matrix(histogram nvar=all);
run;

0,1
8 Variables:,mpg cylinders displacement horsepower weight acceleration model_year origin

Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics,Simple Statistics
Variable,N,Mean,Std Dev,Median,Minimum,Maximum
mpg,380,23.70158,7.50974,23.0,9.0,44.6
cylinders,380,5.38684,1.66616,4.0,3.0,8.0
displacement,380,187.44605,98.13398,144.5,68.0,429.0
horsepower,380,101.44953,33.04836,92.5,46.0,198.0
weight,380,2933.0,822.75973,2765.0,1613.0,5140.0
acceleration,380,15.63842,2.48857,15.5,9.5,22.2
model_year,380,76.15263,3.61729,76.0,70.0,82.0
origin,380,1.58421,0.80942,1.0,1.0,3.0

"Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Pearson Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0"
Unnamed: 0_level_1,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mpg,1.00000,-0.76767 <.0001,-0.80482 <.0001,-0.78057 <.0001,-0.83232 <.0001,0.35788 <.0001,0.54538 <.0001,0.55173 <.0001
cylinders,-0.76767 <.0001,1.00000,0.95501 <.0001,0.84877 <.0001,0.89323 <.0001,-0.46119 <.0001,-0.29657 <.0001,-0.54953 <.0001
displacement,-0.80482 <.0001,0.95501 <.0001,1.00000,0.88482 <.0001,0.93873 <.0001,-0.48179 <.0001,-0.30745 <.0001,-0.60928 <.0001
horsepower,-0.78057 <.0001,0.84877 <.0001,0.88482 <.0001,1.00000,0.87559 <.0001,-0.64670 <.0001,-0.33788 <.0001,-0.45135 <.0001
weight,-0.83232 <.0001,0.89323 <.0001,0.93873 <.0001,0.87559 <.0001,1.00000,-0.37413 <.0001,-0.25531 <.0001,-0.57333 <.0001
acceleration,0.35788 <.0001,-0.46119 <.0001,-0.48179 <.0001,-0.64670 <.0001,-0.37413 <.0001,1.00000,0.21830 <.0001,0.16527 0.0012
model_year,0.54538 <.0001,-0.29657 <.0001,-0.30745 <.0001,-0.33788 <.0001,-0.25531 <.0001,0.21830 <.0001,1.00000,0.14790 0.0039
origin,0.55173 <.0001,-0.54953 <.0001,-0.60928 <.0001,-0.45135 <.0001,-0.57333 <.0001,0.16527 0.0012,0.14790 0.0039,1.00000

"Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0","Spearman Correlation Coefficients, N = 380 Prob > |r| under H0: Rho=0"
Unnamed: 0_level_1,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mpg,1.00000,-0.80865 <.0001,-0.84579 <.0001,-0.82933 <.0001,-0.86962 <.0001,0.38502 <.0001,0.53784 <.0001,0.56565 <.0001
cylinders,-0.80865 <.0001,1.00000,0.90795 <.0001,0.79195 <.0001,0.86831 <.0001,-0.42377 <.0001,-0.28421 <.0001,-0.59115 <.0001
displacement,-0.84579 <.0001,0.90795 <.0001,1.00000,0.85327 <.0001,0.94651 <.0001,-0.44368 <.0001,-0.24781 <.0001,-0.70101 <.0001
horsepower,-0.82933 <.0001,0.79195 <.0001,0.85327 <.0001,1.00000,0.86401 <.0001,-0.60706 <.0001,-0.32410 <.0001,-0.48413 <.0001
weight,-0.86962 <.0001,0.86831 <.0001,0.94651 <.0001,0.86401 <.0001,1.00000,-0.35886 <.0001,-0.22616 <.0001,-0.61979 <.0001
acceleration,0.38502 <.0001,-0.42377 <.0001,-0.44368 <.0001,-0.60706 <.0001,-0.35886 <.0001,1.00000,0.21549 <.0001,0.17756 0.0005
model_year,0.53784 <.0001,-0.28421 <.0001,-0.24781 <.0001,-0.32410 <.0001,-0.22616 <.0001,0.21549 <.0001,1.00000,0.13034 0.0110
origin,0.56565 <.0001,-0.59115 <.0001,-0.70101 <.0001,-0.48413 <.0001,-0.61979 <.0001,0.17756 0.0005,0.13034 0.0110,1.00000


## Log Transformation

In [21]:
* The log transformation can be used to make highly skewed distributions less skewed;
* Add the natural log of displacement as disp_log;

data auto.log_data;
    set auto.outlier_removed;
    disp_log = log(displacement);
run;

* Compare the histograms of the transformed and the original variable;
ods select Histogram;
proc univariate data=auto.log_data noprint;
    var disp_log displacement;
    histogram / kernel normal;
run;

In [22]:
* Apply the logarithmic transformation to the mpg variable;
* AUTO.LOG_DATA is rewritten in place with the extra column mpg_log;

data auto.log_data;
    set auto.log_data;
    mpg_log = log(mpg);
run;

* Compare the histograms of the transformed and the original variable;
ods select Histogram;
proc univariate data=auto.log_data noprint;
    var mpg_log mpg;
    histogram / kernel normal;
run;

## Final Visualizations

In [23]:
* Plot a scatter plot matrix to visualize the relations between the variables,
  with a histogram of each variable on the diagonal;

proc sgscatter data=auto.outlier_removed;
    matrix mpg cylinders displacement horsepower weight model_year acceleration
        / diagonal=(histogram);
run;

In [24]:
* Create 3D visualizations for checking the impact of different variables on mpg;
* Each BIVAR statement requests a bivariate histogram and a density surface plot;
* Multiple PLOTS= requests have to be enclosed in parentheses;

proc kde data=auto.log_data;
    bivar mpg cylinders    / noprint plots=(histogram surface);
    bivar mpg displacement / noprint plots=(histogram surface);
    bivar mpg horsepower   / noprint plots=(histogram surface);
    bivar mpg weight       / noprint plots=(histogram surface);
    bivar mpg model_year   / noprint plots=(histogram surface);
    bivar mpg acceleration / noprint plots=(histogram surface);
run;