# Appending Data
First, import the necessary packages and load `winequality-red.csv` and `winequality-white.csv`.

In [72]:
# import numpy and pandas
import numpy as np
import pandas as pd

# load red and white wine datasets
df1 = pd.read_csv('winequality-red.csv' ,sep=';')
df2 = pd.read_csv('winequality-white.csv' ,sep=';')

In [15]:
df1.shape

(1599, 13)

In [16]:
df2.shape

(4898, 13)

## Create Color Columns
Create two arrays as long as the number of rows in the red and white dataframes that repeat the value “red” or “white.” NumPy offers really easy way to do this. Here’s the documentation for [NumPy’s repeat](https://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html) function. Take a look and try it yourself.

In [78]:
# create color array for red dataframe
color1 = np.repeat('red',1599)
df1['color'] = color1
# create color array for white dataframe
color2 = np.repeat('white',4898)
df2['color'] = color2

Add arrays to the red and white dataframes. Do this by setting a new column called 'color' to the appropriate array.

Do the same for the white dataframe and use `head()` to confirm the change.

In [79]:
df1.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur-dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [80]:
df2.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,white
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,white
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,white
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white


## Combine DataFrames with Append
Check the documentation for [Pandas' append](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html) function and see if you can use this to figure out how to combine the dataframes.

In [81]:
# append the dataframes  (There are 3 ways to do this, can you use them all ?)
df1.append(df2 ,ignore_index=True)
# view dataframe to check for success


  df1.append(df2 ,ignore_index=True)


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur-dioxide,density,pH,sulphates,alcohol,quality,color,total_sulfur_dioxide
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red,
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red,
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red,
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red,
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,,0.99114,3.27,0.50,11.2,6,white,92.0
6493,6.6,0.32,0.36,8.0,0.047,57.0,,0.99490,3.15,0.46,9.6,5,white,168.0
6494,6.5,0.24,0.19,1.2,0.041,30.0,,0.99254,2.99,0.46,9.4,6,white,111.0
6495,5.5,0.29,0.30,1.1,0.022,20.0,,0.98869,3.34,0.38,12.8,7,white,110.0


In [82]:
f = pd.concat([df1,df2] , ignore_index=True)

In [83]:
f.isnull().sum()

fixed_acidity              0
volatile_acidity           0
citric_acid                0
residual_sugar             0
chlorides                  0
free_sulfur_dioxide        0
total_sulfur-dioxide    4898
density                    0
pH                         0
sulphates                  0
alcohol                    0
quality                    0
color                      0
total_sulfur_dioxide    1599
dtype: int64

## Scroll to right, you will find a column filled with NaN values. Go watch the next video and get back here to solve the problem (This is necessary for the next tasks !!)

In [84]:
# fix column names (do not use the usual solution we used earlier in the previous tasks)
df1.rename(columns={'total_sulfur-dioxide':'total_sulfur_dioxide'},inplace=True)
# merge the two datasets again after fixing the issue (keep the color column)
f = pd.concat([df1,df2] , ignore_index=True)
f


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [85]:
# Confirm your changes
f.isnull().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
color                   0
dtype: int64

## Save Combined Dataset
Save your newly combined dataframe as `winequality_edited.csv`. Remember, set `index=False` to avoid saving with an unnamed column!

In [86]:
# save the dataframe
f.to_csv('winequality_edited.csv')

In [87]:
f.shape

(6497, 13)

In [None]:
# How many samples are there in the newely saved dataframe?___6497________
# How many columns are there?______13_____ 