### Using python csv package

In [5]:
import csv  # PEP 305 - CSV File API: https://www.python.org/dev/peps/pep-0305/

In [3]:
# List every name defined on the csv module: public API, dunder attributes and private helpers alike.
dir(csv)

['Dialect',
 'DictReader',
 'DictWriter',
 'Error',
 'OrderedDict',
 'QUOTE_ALL',
 'QUOTE_MINIMAL',
 'QUOTE_NONE',
 'QUOTE_NONNUMERIC',
 'Sniffer',
 'StringIO',
 '_Dialect',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '__version__',
 'excel',
 'excel_tab',
 'field_size_limit',
 'get_dialect',
 'list_dialects',
 're',
 'reader',
 'register_dialect',
 'unix_dialect',
 'unregister_dialect',
 'writer']

In [7]:
# Show the names of the dialects registered with the csv module.
csv.list_dialects()

['excel', 'excel-tab', 'unix']

In [10]:
# Read the file record by record; csv.reader yields each record as a list of strings.
# newline='' is what the csv documentation asks for when passing a file object, so that
# line breaks embedded in quoted fields reach the parser untouched.
with open('./files/users-simple-five.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    for row in csv_reader:
        print(row, type(row))

['-1', '1', 'on the server farm', 'Community'] <class 'list'>
['1', '101', 'New York, NY', 'Adam Lear'] <class 'list'>
['2', '101', 'Corvallis, OR', 'Geoff Dalgas'] <class 'list'>
['3', '101', '', 'hichris123'] <class 'list'>
['4', '101', 'Republic of Texas', 'Ben Collins'] <class 'list'>


In [11]:
# The file has no header line, so the field names are supplied explicitly;
# DictReader then yields one mapping per record keyed by those names.
# newline='' follows the csv documentation's requirement for file objects.
with open('./files/users-simple-five.csv', 'r', newline='') as csv_file:
    reader = csv.DictReader(csv_file, fieldnames=['Id', 'Reputation', 'Location', 'DisplayName'])
    for row in reader:
        print(f"{row['DisplayName']} has a reputation of {row['Reputation']}")

Community has a reputation of 1
Adam Lear has a reputation of 101
Geoff Dalgas has a reputation of 101
hichris123 has a reputation of 101
Ben Collins has a reputation of 101


In [12]:
# quoting=csv.QUOTE_NONNUMERIC makes the reader convert every unquoted field to float,
# while quoted fields stay as strings.
# newline='' follows the csv documentation's requirement for file objects.
with open('./files/users-simple-five.csv', 'r', newline='') as file:
    csv_reader = csv.reader(file, quoting=csv.QUOTE_NONNUMERIC)
    for row in csv_reader:
        print(row)

[-1.0, 1.0, 'on the server farm', 'Community']
[1.0, 101.0, 'New York, NY', 'Adam Lear']
[2.0, 101.0, 'Corvallis, OR', 'Geoff Dalgas']
[3.0, 101.0, '', 'hichris123']
[4.0, 101.0, 'Republic of Texas', 'Ben Collins']


In [15]:
# This file was created with quoting=csv.QUOTE_MINIMAL, so its text fields are unquoted
# and QUOTE_NONNUMERIC fails when it tries to convert them to float.
# The ValueError is the point of the demonstration: it is caught and printed so the
# cells that follow still run under "Restart & Run All".
with open('./files/users-five.csv', 'r', newline='') as file:
    csv_reader = csv.reader(file, quoting=csv.QUOTE_NONNUMERIC)
    try:
        for row in csv_reader:
            print(row)
    except ValueError as err:
        print(f"{type(err).__name__}: {err}")

ValueError: could not convert string to float: 'on the server farm'

### Using pandas

In [16]:
import pandas as pd

In [19]:
# read_csv parses a local CSV into a DataFrame; with no header argument
# the first line of the file is used for the column labels.
posts_csv = pd.read_csv('./files/posts-100.csv')
print(type(posts_csv))
posts_csv.head(n=2)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,5,1,2014-05-13T23:58:30.457,9,448,2014-05-14T00:36:31.077,How can I do simple machine learning without hard-coding behavior?,<machine-learning>,1.1,1.2,1.3,2014-05-14T14:40:25.950
0,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
1,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,


In [21]:
# URL of the raw sample posts CSV on GitHub; the read_csv examples that follow load it by this name.
remote_file = 'https://raw.githubusercontent.com/xmorera/sample-data/master/csv/posts-100.csv'

In [20]:
# read_csv accepts a URL as well as a local path, for example:
# https://raw.githubusercontent.com/xmorera/sample-data/master/csv/posts-100.csv
df = pd.read_csv(remote_file, header=None)
df.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950


In [23]:
# nrows limits how many data rows are parsed, so only the start of the file is loaded
df_small = pd.read_csv(
    remote_file,
    nrows=3,
)
df_small

Unnamed: 0,5,1,2014-05-13T23:58:30.457,9,448,2014-05-14T00:36:31.077,How can I do simple machine learning without hard-coding behavior?,<machine-learning>,1.1,1.2,1.3,2014-05-14T14:40:25.950
0,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
1,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
2,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,


In [24]:
# skiprows drops the leading lines before parsing begins, so the read starts mid-file;
# the first line that remains is taken as the header row
df_small = pd.read_csv(
    remote_file,
    skiprows=3,
    nrows=3,
)
df_small

Unnamed: 0,10,2,2014-05-14T00:53:43.273,12,Unnamed: 4,2014-05-14T00:53:43.273.1,Unnamed: 6,Unnamed: 7,Unnamed: 8,1,Unnamed: 10,Unnamed: 11
0,14,1,2014-05-14T01:25:59.677,21,1243,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4,1,4.0,
1,15,1,2014-05-14T01:41:23.110,2,543,2014-05-14T01:41:23.110,What are the advantages and disadvantages of S...,<databases>,0,1,,2014-05-14T07:41:49.437
2,16,1,2014-05-14T01:57:56.880,18,322,2014-05-17T16:24:14.523,Use liblinear on big data for semantic analysis,<machine-learning><bigdata><libsvm>,2,0,,


In [26]:
# skiprows also accepts a callable: it receives each row index and the row is skipped
# when the callable returns True. Skipping the odd indices keeps lines 0, 2, 4, ...
df_odd = pd.read_csv(
    remote_file,
    skiprows=lambda row_idx: row_idx % 2 == 1,
)
df_odd.head(n=5)

Unnamed: 0,5,1,2014-05-13T23:58:30.457,9,448,2014-05-14T00:36:31.077,How can I do simple machine learning without hard-coding behavior?,<machine-learning>,1.1,1.2,1.3,2014-05-14T14:40:25.950
0,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
1,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,
2,16,1,2014-05-14T01:57:56.880,18,322.0,2014-05-17T16:24:14.523,Use liblinear on big data for semantic analysis,<machine-learning><bigdata><libsvm>,2.0,0,,
3,18,4,2014-05-14T02:49:14.580,0,,2014-05-14T02:49:14.580,,,,0,,
4,20,1,2014-05-14T05:37:46.780,17,311.0,2017-08-29T11:26:37.137,the data on our relational DBMS is getting big...,<nosql><relational-dbms>,5.0,1,1.0,


In [28]:
# usecols restricts parsing to the listed column positions
df_columns = pd.read_csv(
    remote_file,
    usecols=[0, 6, 7, 8],
)
df_columns.head(n=2)

Unnamed: 0,5,How can I do simple machine learning without hard-coding behavior?,<machine-learning>,1.1
0,7,What open-source books (or other materials) pr...,<education><open-source>,3.0
1,9,,,


In [30]:
df_columns.columns # .columns is the Index object holding the frame's column labels

Index(['5',
       'How can I do simple machine learning without hard-coding behavior?',
       '<machine-learning>', '1.1'],
      dtype='object')

In [31]:
# header=None states that the file has no header row; pandas then labels
# the columns with an auto-generated integer index
df_no_header = pd.read_csv(
    remote_file,
    header=None,
)
df_no_header.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950


In [33]:
# Add a prefix to the auto-generated column index.
# read_csv's own `prefix` argument was deprecated in pandas 1.4 and removed in 2.0;
# DataFrame.add_prefix yields the same 'Col0' ... 'Col11' labels on old and new versions.
df_no_header = pd.read_csv(remote_file, header=None).add_prefix('Col')
print(df_no_header.columns)
df_no_header.head(2)

Index(['Col0', 'Col1', 'Col2', 'Col3', 'Col4', 'Col5', 'Col6', 'Col7', 'Col8',
       'Col9', 'Col10', 'Col11'],
      dtype='object')


Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950


In [35]:
# names= supplies the column labels directly; the list lines up with the columns kept by usecols
header_names = [
    'New_Id',
    'New_PostTypeId',
    'New_CreationDate',
    'New_Score',
]
df = pd.read_csv(remote_file, usecols=[0, 1, 2, 3], names=header_names)
print(df.columns)
df.head(n=2)

Index(['New_Id', 'New_PostTypeId', 'New_CreationDate', 'New_Score'], dtype='object')


Unnamed: 0,New_Id,New_PostTypeId,New_CreationDate,New_Score
0,5,1,2014-05-13T23:58:30.457,9
1,7,1,2014-05-14T00:11:06.457,4


In [36]:
# This file starts with a header row; the default header='infer' picks it up as the column labels
df = pd.read_csv('./files/posts-100-header.csv')
print(df.columns)
df.head(n=2)

Index(['Id', 'PostTypeId', 'CreationDate', 'Score', 'ViewCount',
       'LastActivityDate', 'Title', 'Tags', 'AnswerCount', 'CommentCount',
       'FavoriteCount', 'ClosedDate'],
      dtype='object')


Unnamed: 0,Id,PostTypeId,CreationDate,Score,ViewCount,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950


In [37]:
# Same file, but header=None makes pandas ignore the header row as labels:
# the columns get integer labels and the header line is loaded as the first data row
df = pd.read_csv(
    './files/posts-100-header.csv',
    header=None,
)
print(df.columns)
df.head(n=2)

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Id,PostTypeId,CreationDate,Score,ViewCount,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate
1,5,1,2014-05-13T23:58:30.457,9,448,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1,1,1,2014-05-14T14:40:25.950


In [41]:
# 'infer' is the default for header: the first row supplies the column index values
pd.read_csv(
    './files/posts-100-header.csv',
    header='infer',
).columns

Index(['Id', 'PostTypeId', 'CreationDate', 'Score', 'ViewCount',
       'LastActivityDate', 'Title', 'Tags', 'AnswerCount', 'CommentCount',
       'FavoriteCount', 'ClosedDate'],
      dtype='object')

In [39]:
# header=None generates an index based on integer position
pd.read_csv('./files/posts-100-header.csv', header=None).columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [42]:
# header='infer' always takes the first row for the column index values,
# even when that row is actually data (this file has no header row)
pd.read_csv(
    './files/posts-100.csv',
    header='infer',
).columns

Index(['5', '1', '2014-05-13T23:58:30.457', '9', '448',
       '2014-05-14T00:36:31.077',
       'How can I do simple machine learning without hard-coding behavior?',
       '<machine-learning>', '1.1', '1.2', '1.3', '2014-05-14T14:40:25.950'],
      dtype='object')

In [43]:
# .dtypes reports the type pandas chose for each imported column
pd.read_csv(
    './files/posts-100-header.csv',
    usecols=[0, 1, 2, 7],
).dtypes

Id               int64
PostTypeId       int64
CreationDate    object
Tags            object
dtype: object

In [49]:
# dtype={column: type} sets column types at import time; load the same columns twice,
# once with PostTypeId as float and once as str, and compare the resulting dtypes
typed_as_float = pd.read_csv(
    './files/posts-100-header.csv',
    dtype={'PostTypeId': float},
    usecols=[0, 1, 2, 7],
)
typed_as_str = pd.read_csv(
    './files/posts-100-header.csv',
    dtype={'PostTypeId': str},
    usecols=[0, 1, 2, 7],
)
print(typed_as_float.dtypes)
print('')
print(typed_as_str.dtypes)

Id                int64
PostTypeId      float64
CreationDate     object
Tags             object
dtype: object

Id               int64
PostTypeId      object
CreationDate    object
Tags            object
dtype: object


In [58]:
# A converter is a function applied to each raw value of its column while the file is loaded
import re


def split_tags(raw_tags):
    """Return the list of '<...>' tag tokens found in a raw Tags string."""
    return re.findall('<[A-Za-z0-9_-]*>', raw_tags)


# Without a converter the Tags cell stays a single string
df = pd.read_csv(
    './files/posts-100-header.csv',
    dtype={'PostTypeId': str},
    usecols=[0, 1, 2, 7],
)
print(df.iloc[1])
print()
print(df.iloc[1]['Tags'], type(df.iloc[1]['Tags']))
print()

# With the converter each Tags cell is loaded as a list of tags
df = pd.read_csv(
    './files/posts-100-header.csv',
    usecols=[0, 1, 2, 7],
    converters={'Tags': split_tags},
)
print(df.iloc[1]['Tags'], type(df.iloc[1]['Tags']))
df.head(n=3)

Id                                     7
PostTypeId                             1
CreationDate     2014-05-14T00:11:06.457
Tags            <education><open-source>
Name: 1, dtype: object

<education><open-source> <class 'str'>

['<education>', '<open-source>'] <class 'list'>


Unnamed: 0,Id,PostTypeId,CreationDate,Tags
0,5,1,2014-05-13T23:58:30.457,[<machine-learning>]
1,7,1,2014-05-14T00:11:06.457,"[<education>, <open-source>]"
2,9,2,2014-05-14T00:36:31.077,[]


In [64]:
# Loading dates: pass parse_dates a list of column indexes or names.
# First load without it: CreationDate stays a plain string column.
df = pd.read_csv(
    './files/posts-100-header.csv',
    dtype={'PostTypeId': str},
    usecols=[0, 1, 2, 7],
)
print(df.dtypes)
print(type(df['CreationDate'].iloc[0]))
print()

# Then load with parse_dates: the same column becomes datetime64 / Timestamp.
df = pd.read_csv(
    './files/posts-100-header.csv',
    dtype={'PostTypeId': str},
    usecols=[0, 1, 2, 7],
    parse_dates=['CreationDate'],
)
print(df.dtypes)
print(type(df['CreationDate'].iloc[0]))

Id               int64
PostTypeId      object
CreationDate    object
Tags            object
dtype: object
<class 'str'>

Id                       int64
PostTypeId              object
CreationDate    datetime64[ns]
Tags                    object
dtype: object
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [72]:
# Handling missing values

# Default behaviour: missing values are converted to NaN
df = pd.read_csv(
    './files/posts-100-header.csv',
    usecols=[0, 3, 4, 8, 9, 10],
)
print(df.head(5))
print()

# na_filter=False: missing values are not converted to NaN
df = pd.read_csv(
    './files/posts-100-header.csv',
    usecols=[0, 3, 4, 8, 9, 10],
    na_filter=False,
)
print(df.head(5))
print()

# Forcing a float dtype together with na_filter=False raises an exception,
# because the missing values cannot be converted to float:
#df = pd.read_csv('./files/posts-100-header.csv',usecols=[0,3,4,8,9,10],
#                 dtype={'ViewCount': float}, na_filter=False)

# na_filter=True avoids that exception
df = pd.read_csv(
    './files/posts-100-header.csv',
    usecols=[0, 3, 4, 8, 9, 10],
    dtype={'ViewCount': float},
    na_filter=True,
)
print(df.head(5))
print()

   Id  Score  ViewCount  AnswerCount  CommentCount  FavoriteCount
0   5      9      448.0          1.0             1            1.0
1   7      4      388.0          3.0             4            1.0
2   9      5        NaN          NaN             0            NaN
3  10     12        NaN          NaN             1            NaN
4  14     21     1243.0          4.0             1            4.0

   Id  Score ViewCount AnswerCount  CommentCount FavoriteCount
0   5      9       448           1             1             1
1   7      4       388           3             4             1
2   9      5                                   0              
3  10     12                                   1              
4  14     21      1243           4             1             4
   Id  Score  ViewCount  AnswerCount  CommentCount  FavoriteCount
0   5      9      448.0          1.0             1            1.0
1   7      4      388.0          3.0             4            1.0
2   9      5        NaN    

### Tab-separated values file

In [77]:
# Reading the .tsv with read_csv defaults raises an exception, because ',' is tried as the separator:
#pd.read_csv('./files/posts-100.tsv').head()

# sep= defines the separator explicitly ...
pd.read_csv('./files/posts-100.tsv', sep='\t').head()
# ... and delimiter= can be used the same way
pd.read_csv('./files/posts-100.tsv', delimiter='\t').head()

# Alternate pandas function for the same file
pd.read_table('./files/posts-100.tsv').head(n=3)

Unnamed: 0,5,1,2014-05-13T23:58:30.457,9,448,2014-05-14T00:36:31.077,How can I do simple machine learning without hard-coding behavior?,<machine-learning>,1.1,1.2,1.3,2014-05-14T14:40:25.950
0,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
1,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
2,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
