# Appending Data
First, import the necessary packages and load `winequality-red.csv` and `winequality-white.csv`.

In [22]:
# import numpy and pandas
import pandas as pd
import numpy as np

# load red and white wine datasets
red_df = pd.read_csv('winequality-red.csv', sep=';')
white_df = pd.read_csv('winequality-white.csv', sep=';')

#red_df.head() #inspect to check delimiter
red_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed_acidity           1599 non-null float64
volatile_acidity        1599 non-null float64
citric_acid             1599 non-null float64
residual_sugar          1599 non-null float64
chlorides               1599 non-null float64
free_sulfur_dioxide     1599 non-null float64
total_sulfur-dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [23]:
#white_df.head() #inspect to check delimiter
white_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed_acidity           4898 non-null float64
volatile_acidity        4898 non-null float64
citric_acid             4898 non-null float64
residual_sugar          4898 non-null float64
chlorides               4898 non-null float64
free_sulfur_dioxide     4898 non-null float64
total_sulfur_dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


## Create Color Columns
Create two arrays as long as the number of rows in the red and white dataframes that repeat the value “red” or “white.” NumPy offers really easy way to do this. Here’s the documentation for [NumPy’s repeat](https://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html) function. Take a look and try it yourself.

In [24]:
# create color array for red dataframe
color_red = np.repeat('red', 1599)

# create color array for white dataframe
color_white = np.repeat('white', 4898)

Add arrays to the red and white dataframes. Do this by setting a new column called 'color' to the appropriate array. The cell below does this for the red dataframe.

In [25]:
red_df['color'] = color_red
red_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur-dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


Do the same for the white dataframe and use `head()` to confirm the change.

In [26]:
white_df['color'] = color_white
white_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,white
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,white
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,white
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white


In [27]:
white_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 13 columns):
fixed_acidity           4898 non-null float64
volatile_acidity        4898 non-null float64
citric_acid             4898 non-null float64
residual_sugar          4898 non-null float64
chlorides               4898 non-null float64
free_sulfur_dioxide     4898 non-null float64
total_sulfur_dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
color                   4898 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 497.5+ KB


## Combine DataFrames with Append
Check the documentation for [Pandas' append](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html) function and see if you can use this to figure out how to combine the dataframes. (Bonus: Why aren't we using the [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) method to combine the dataframes?) If you don’t get it, I’ll show you how afterwards. Make sure to save your work in this notebook! You'll come back to this later.

In [28]:
# append dataframes
wine_df = pd.concat([red_df, white_df])

# view dataframe to check for success
wine_df.head()

Unnamed: 0,alcohol,chlorides,citric_acid,color,density,fixed_acidity,free_sulfur_dioxide,pH,quality,residual_sugar,sulphates,total_sulfur-dioxide,total_sulfur_dioxide,volatile_acidity
0,9.4,0.076,0.0,red,0.9978,7.4,11.0,3.51,5,1.9,0.56,34.0,,0.7
1,9.8,0.098,0.0,red,0.9968,7.8,25.0,3.2,5,2.6,0.68,67.0,,0.88
2,9.8,0.092,0.04,red,0.997,7.8,15.0,3.26,5,2.3,0.65,54.0,,0.76
3,9.8,0.075,0.56,red,0.998,11.2,17.0,3.16,6,1.9,0.58,60.0,,0.28
4,9.4,0.076,0.0,red,0.9978,7.4,11.0,3.51,5,1.9,0.56,34.0,,0.7


In [29]:
#inspect wine_df in details to verify concateation
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 14 columns):
alcohol                 6497 non-null float64
chlorides               6497 non-null float64
citric_acid             6497 non-null float64
color                   6497 non-null object
density                 6497 non-null float64
fixed_acidity           6497 non-null float64
free_sulfur_dioxide     6497 non-null float64
pH                      6497 non-null float64
quality                 6497 non-null int64
residual_sugar          6497 non-null float64
sulphates               6497 non-null float64
total_sulfur-dioxide    1599 non-null float64
total_sulfur_dioxide    4898 non-null float64
volatile_acidity        6497 non-null float64
dtypes: float64(12), int64(1), object(1)
memory usage: 761.4+ KB


In [30]:
wine_df.iloc[6490:]

Unnamed: 0,alcohol,chlorides,citric_acid,color,density,fixed_acidity,free_sulfur_dioxide,pH,quality,residual_sugar,sulphates,total_sulfur-dioxide,total_sulfur_dioxide,volatile_acidity
4891,10.6,0.038,0.32,white,0.99074,5.7,38.0,3.24,6,0.9,0.46,,121.0,0.21
4892,9.7,0.032,0.38,white,0.99298,6.5,29.0,3.29,5,1.3,0.54,,112.0,0.23
4893,11.2,0.039,0.29,white,0.99114,6.2,24.0,3.27,6,1.6,0.5,,92.0,0.21
4894,9.6,0.047,0.36,white,0.9949,6.6,57.0,3.15,5,8.0,0.46,,168.0,0.32
4895,9.4,0.041,0.19,white,0.99254,6.5,30.0,2.99,6,1.2,0.46,,111.0,0.24
4896,12.8,0.022,0.3,white,0.98869,5.5,20.0,3.34,7,1.1,0.38,,110.0,0.29
4897,11.8,0.02,0.38,white,0.98941,6.0,22.0,3.26,6,0.8,0.32,,98.0,0.21


## Save Combined Dataset
Save your newly combined dataframe as `winequality_edited.csv`. Remember, set `index=False` to avoid saving with an unnamed column!

In [33]:
wine_df.to_csv('winequality_edited.csv', index=False)

In [34]:
df_new = pd.read_csv('winequality_edited.csv')
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 14 columns):
alcohol                 6497 non-null float64
chlorides               6497 non-null float64
citric_acid             6497 non-null float64
color                   6497 non-null object
density                 6497 non-null float64
fixed_acidity           6497 non-null float64
free_sulfur_dioxide     6497 non-null float64
pH                      6497 non-null float64
quality                 6497 non-null int64
residual_sugar          6497 non-null float64
sulphates               6497 non-null float64
total_sulfur-dioxide    1599 non-null float64
total_sulfur_dioxide    4898 non-null float64
volatile_acidity        6497 non-null float64
dtypes: float64(12), int64(1), object(1)
memory usage: 710.7+ KB
