## Pandas Tutorial_part3

- Reading & Writing **CSV** and **Excel** files in Pandas:

In [3]:
# Let's first read a stock CSV file:
import pandas as pd
df = pd.read_csv("stock_data.csv")
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1,85,64,bill gates
3,RIL,not available,50,1023,mukesh ambani
4,TATA,5.6,-1,n.a.,ratan tata


In [10]:
# Now sometimes your raw data has some extra header, and you don't need that extra header in jupyter, so to remove that, we can use skiprows and header arguments:
# Let's 1st check skipwors argument:
df = pd.read_csv("stock_data.csv", skiprows=1)
df
# 1=> shows the number of rows to be skip.

Unnamed: 0,GOOGL,27.82,87,845,larry page
0,WMT,4.61,484,65,n.a.
1,MSFT,-1,85,64,bill gates
2,RIL,not available,50,1023,mukesh ambani
3,TATA,5.6,-1,n.a.,ratan tata


In [11]:
# using header argument:
df = pd.read_csv("stock_data.csv", header=1)
df
# 1=> shows the number of rows to be skip.

Unnamed: 0,GOOGL,27.82,87,845,larry page
0,WMT,4.61,484,65,n.a.
1,MSFT,-1,85,64,bill gates
2,RIL,not available,50,1023,mukesh ambani
3,TATA,5.6,-1,n.a.,ratan tata


In [12]:
# If you have missed header in your original data, you can add it through header=None and pass the header names...
df = pd.read_csv("stock_data.csv", header=None, names=["tickers", "eps", "revenue", "price", "people"])
df

Unnamed: 0,tickers,eps,revenue,price,people
0,tickers,eps,revenue,price,people
1,GOOGL,27.82,87,845,larry page
2,WMT,4.61,484,65,n.a.
3,MSFT,-1,85,64,bill gates
4,RIL,not available,50,1023,mukesh ambani
5,TATA,5.6,-1,n.a.,ratan tata


In [13]:
# You can print specific rows from your dataset using nrows argument:
df = pd.read_csv("stock_data.csv", nrows=3)
df
# 3=> means, three rows will be printed.

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1.0,85,64,bill gates


In [34]:
# If you've some empty cells or cells which doesn't give any meaning in your dataset, then you can provide specific name for them, it's pretty good for analysing messy data:
# Here we provide NaN values for such cells.
df = pd.read_csv("stock_data.csv", na_values=["n.a.", "not available"])
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845.0,larry page
1,WMT,4.61,484,65.0,
2,MSFT,-1.0,85,64.0,bill gates
3,RIL,,50,1023.0,mukesh ambani
4,TATA,5.6,-1,,ratan tata


In [35]:
# Now other problem in our data set is that we have negative values for revenue columns, so as we know revenue can't be negative, it may be zeor or positive. so we can convert (-1) to NaN also:
df1 = pd.read_csv("stock_data.csv", na_values=["n.a.", "not available", -1])
df1

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87.0,845.0,larry page
1,WMT,4.61,484.0,65.0,
2,MSFT,,85.0,64.0,bill gates
3,RIL,,50.0,1023.0,mukesh ambani
4,TATA,5.6,,,ratan tata


In [39]:
# If you look to our previous execution, we were expecting to change the negative revenue to NaN value, we did that, but beside of that we change the (-1) values of "eps" column also, so to prevent that we can supply dictionary instead supplying a list:
df = pd.read_csv("stock_data.csv", na_values={
    'eps':["not available", "n.a."],
    'revenue':["not available", "n.a.", -1],
    'people':["not available", "n.a."],
    'price':["not available", "n.a."]
})
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87.0,845.0,larry page
1,WMT,4.61,484.0,65.0,
2,MSFT,-1.0,85.0,64.0,bill gates
3,RIL,,50.0,1023.0,mukesh ambani
4,TATA,5.6,,,ratan tata


In [40]:
# Writing in CSV file: we will write stock_data to a "new.csv" file:
df.to_csv("new.csv")

In [41]:
# So here we have one observation, it's they write index as well, so to prevent it, we can use index=False statement:
df.to_csv("new.csv", index=False)

In [43]:
df.columns

Index(['tickers', 'eps', 'revenue', 'price', 'people'], dtype='object')

In [45]:
# We can write a specific columns:
df.to_csv("new.csv", index=False, columns=["tickers", "price"])

In [46]:
# You can skip header from writing:
df.to_csv("new.csv", index=False, header=False)

In [48]:
# Reading & Writing in excel files,
df = pd.read_excel("stock_data.xlsx", "Sheet1")
df

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,larry page
1,WMT,4.61,484,65,n.a.
2,MSFT,-1,85,64,bill gates
3,RIL,not available,50,1023,mukesh ambani
4,TATA,5.6,-1,n.a.,ratan tata


In [56]:
# We can conver a cell value using function:
def people_convert_cell(cell):
    if cell == "n.a.":
        return "Mansoor"
    return "cell"
df = pd.read_excel("stock_data.xlsx", converters={
    "people":people_convert_cell
})
df
# for other columns we can follow the same strategy...

Unnamed: 0,tickers,eps,revenue,price,people
0,GOOGL,27.82,87,845,cell
1,WMT,4.61,484,65,Mansoor
2,MSFT,-1,85,64,cell
3,RIL,not available,50,1023,cell
4,TATA,5.6,-1,n.a.,cell


In [59]:
# To write in excel file:
df.to_excel("new.xlsx", sheet_name="stocks")

In [61]:
# To preventing index from writing:
df.to_excel("new.xlsx", sheet_name="stocks", index=False)

In [63]:
# To write in specific offset (Position) in excel, we can use startrow and startcol arguments:
df.to_excel("new.xlsx", sheet_name="stocks", startrow=1, startcol=2, index=False)

In [64]:
# Now if you have two Data Frames and want to insert it into two sheets, so we can use ExcelWriter class:
df_stocks = pd.DataFrame({
    'tickers': ['GOOGL', 'WMT', 'MSFT'],
    'price': [845, 65, 64 ],
    'pe': [30.37, 14.26, 30.97],
    'eps': [27.82, 4.61, 2.12]
})

df_weather =  pd.DataFrame({
    'day': ['1/1/2017','1/2/2017','1/3/2017'],
    'temperature': [32,35,28],
    'event': ['Rain', 'Sunny', 'Snow']
})

In [69]:
with pd.ExcelWriter("stock_weather_data.xlsx") as writer:
    df_stocks.to_excel(writer, sheet_name="stock")
    df_weather.to_excel(writer, sheet_name="weather")

In [70]:
with pd.ExcelWriter("stock_weather_data.xlsx") as writer:
    df_stocks.to_excel(writer, sheet_name="stock", index=False)
    df_weather.to_excel(writer, sheet_name="weather", index=False)

In [71]:
# These were all about reading and writing in CSV and Excel files...