Import all necessary modules

In [None]:
from google.colab import drive, files
import pandas as pd
import xml.etree.ElementTree as ET
import yaml
import json

# Loading from my google drive

In [None]:
drive.mount(r'/content/drive/')

Mounted at /content/drive/


## **csv/tsv**

In [None]:
df = pd.read_csv(r'/content/drive/MyDrive/HW2_data/addresses.csv')
df

Unnamed: 0,John,Doe,120 jefferson st.,Riverside,NJ,08075
0,Jack,McGinnis,220 hobo Av.,Phila,PA,9119
1,"John ""Da Man""",Repici,120 Jefferson St.,Riverside,NJ,8075
2,Stephen,Tyler,"7452 Terrace ""At the Plaza"" road",SomeTown,SD,91234
3,,Blankman,,SomeTown,SD,298
4,"Joan ""the bone"", Anne",Jet,"9th, at Terrace plc",Desert City,CO,123


It's possible to add column names while loading dataset

In [None]:
col_names = ['first_name', 'second_name', 'address', 'city', 'district', 'code']
df = pd.read_csv(r'/content/drive/MyDrive/HW2_data/addresses.csv', names=col_names)
df

Unnamed: 0,first_name,second_name,address,city,district,code
0,John,Doe,120 jefferson st.,Riverside,NJ,8075
1,Jack,McGinnis,220 hobo Av.,Phila,PA,9119
2,"John ""Da Man""",Repici,120 Jefferson St.,Riverside,NJ,8075
3,Stephen,Tyler,"7452 Terrace ""At the Plaza"" road",SomeTown,SD,91234
4,,Blankman,,SomeTown,SD,298
5,"Joan ""the bone"", Anne",Jet,"9th, at Terrace plc",Desert City,CO,123


To load tsv, we just need to specify the sep argument

In [None]:
df = pd.read_csv(r'/content/drive/MyDrive/HW2_data/more_adresses.tsv', sep='\t')
df

Unnamed: 0,Name,Age,Address
0,Paul,23,1115 W Franklin
1,Bessy the Cow,5,Big Farm Way
2,Zeke,45,W Main St


## **xml**

Just to see what's inside our xml file

In [None]:
with open(r'/content/drive/MyDrive/HW2_data/pizza.xml') as xml_file:
  data = xml_file.read()
print(data)

<order>
    <crust>original</crust>
    <toppings>
        <topping>cheese</topping>
        <topping>pepperoni</topping>
        <topping>garlic</topping>
    </toppings>
    <status>cooking</status>
</order>


pandas.read_xml works, but the result isn't what we've expected

In [None]:
df = pd.read_xml(r'/content/drive/MyDrive/HW2_data/pizza.xml')
df

Unnamed: 0,crust,topping,status
0,original,,
1,,garlic,
2,,,cooking


It's better to parse xml first

In [None]:
xml_data = open(r'/content/drive/MyDrive/HW2_data/pizza.xml').read()
root = ET.XML(xml_data)

In [None]:
data = []
cols = []
for child in root:
  data.append(child.text.strip() + ' '.join([subchild.text for subchild in child]))
  cols.append(child.tag)

In [None]:
data_dict = dict(zip(cols, data))
df = pd.DataFrame([data_dict])
df

Unnamed: 0,crust,toppings,status
0,original,cheese pepperoni garlic,cooking


## **yaml**

Just to see what's inside our yaml file

In [None]:
with open(r'/content/drive/MyDrive/HW2_data/alice.yml') as yaml_file:
  read_data = yaml_file.read()
print(read_data)

# YAML Document starts with ---
# Comments start with #
- - -
  UserName: Alicia
  Password: pinga123 * 
  phone: 3256
  TablesList:
        -EmployeeTable
        -SoftwaresList
        -HardwareList 
...


Skip lines with comments and deserialize yaml to dict

In [None]:
skiped_lines = [1, 2, 3]
with open(r'/content/drive/MyDrive/HW2_data/alice.yml') as yaml_file:
    for i in skiped_lines:
        _ = yaml_file.readline()
    data = yaml.safe_load(yaml_file)
data

{'UserName': 'Alicia',
 'Password': 'pinga123 *',
 'phone': 3256,
 'TablesList': '-EmployeeTable -SoftwaresList -HardwareList'}

Create dataframe from dict

In [None]:
df = pd.DataFrame(data, index=[0])
df

Unnamed: 0,UserName,Password,phone,TablesList
0,Alicia,pinga123 *,3256,-EmployeeTable -SoftwaresList -HardwareList


## **json**

Load json file and deserialize it to dict



In [None]:
with open(r'/content/drive/MyDrive/HW2_data/joe.json') as json_file:
  data = json.load(json_file)
data

{'firstName': 'Joe',
 'lastName': 'Jackson',
 'gender': 'male',
 'age': 28,
 'address': {'streetAddress': '101', 'city': 'San Diego', 'state': 'CA'},
 'phoneNumbers': [{'type': 'home', 'number': '7349282382'}]}

Create dataframe using json_normalize, which can deal with nested dictionaries

In [None]:
data['phoneNumbers'] = data['phoneNumbers'][0]  # unpack dict from list
df = pd.json_normalize(data, sep='_')
df

Unnamed: 0,firstName,lastName,gender,age,address_streetAddress,address_city,address_state,phoneNumbers_type,phoneNumbers_number
0,Joe,Jackson,male,28,101,San Diego,CA,home,7349282382


If we don't care, it's possible to use pandas.DataFrame.from_dict) But the dataframe below isn't so pretty and convinient to work with

In [None]:
df = pd.DataFrame.from_dict(data, orient='index')
df

Unnamed: 0,0
firstName,Joe
lastName,Jackson
gender,male
age,28
address,"{'streetAddress': '101', 'city': 'San Diego', ..."
phoneNumbers,"{'type': 'home', 'number': '7349282382'}"


# Loading from google colaboratory files (current session)

Actually, loading from google colaboratory isn't very different from loading from google drive

it's possible to use files module to upload files to session storage or files can be add using google colab menu

In [None]:
for _ in range(5):
  files.upload()

Saving addresses.csv to addresses.csv


Saving alice.yml to alice.yml


Saving more_adresses.tsv to more_adresses.tsv


Saving my_json.json to my_json.json


Saving my_xml.xml to my_xml.xml


## **csv/tsv**

For the sake of variety, this time load dataset skipping some rows



In [None]:
col_names = ['first_name', 'second_name', 'address', 'city', 'district', 'code']
df = pd.read_csv(r'/content/addresses.csv', skiprows = [2, 4, 5], names=col_names)
df

Unnamed: 0,first_name,second_name,address,city,district,code
0,John,Doe,120 jefferson st.,Riverside,NJ,8075
1,Jack,McGinnis,220 hobo Av.,Phila,PA,9119
2,Stephen,Tyler,"7452 Terrace ""At the Plaza"" road",SomeTown,SD,91234


lets imagine for some reason we need to use first column as indexes. And its tsv file, so our `sep` argument is `'\t'`

In [None]:
df = pd.read_csv(r'/content/more_adresses.tsv', sep='\t', index_col=0)
df

Unnamed: 0_level_0,Age,Address
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Paul,23,1115 W Franklin
Bessy the Cow,5,Big Farm Way
Zeke,45,W Main St


## **xml**

In [None]:
with open(r'/content/my_xml.xml') as xml_file:
  data = xml_file.read()
print(data)

<data>
	<row>
		<shape>square</shape>
		<degrees>360</degrees>
		<sides>4.0</sides>
	</row>
	<row>
		<shape>circle</shape>
		<degrees>360</degrees>
		<sides>5.0</sides>
	</row>
	<row>
		<shape>triangle</shape>
		<degrees>180</degrees>
		<sides>3.0</sides>
	</row>
</data>


In this example I use an xml file with a different structure, so pd.read_xml works fine

In [None]:
df = pd.read_xml(r'/content/my_xml.xml')
df

Unnamed: 0,shape,degrees,sides
0,square,360,4.0
1,circle,360,5.0
2,triangle,180,3.0


## **yaml**

In [None]:
with open(r'/content/alice.yml') as yaml_file:
  read_data = yaml_file.read()
print(read_data)

# YAML Document starts with ---
# Comments start with #
- - -
  UserName: Alicia
  Password: pinga123 * 
  phone: 3256
  TablesList:
        -EmployeeTable
        -SoftwaresList
        -HardwareList 
...


In [None]:
skiped_lines = [1, 2, 3]
with open(r'/content/alice.yml') as yaml_file:
    for i in skiped_lines:
        _ = yaml_file.readline()
    data = yaml.safe_load(yaml_file)

In [None]:
df = pd.DataFrame.from_dict([data])
df

Unnamed: 0,UserName,Password,phone,TablesList
0,Alicia,pinga123 *,3256,-EmployeeTable -SoftwaresList -HardwareList


## **json**

If json file has the following structure: 

```
{
"column_name": ["value_1", ..., "value_n"],
...
"column_name": ["value_1", ..., "value_n"]
}
```

it's convinient to use pandas.read_json

In [None]:
df = pd.read_json(r'/content/my_json.json')
df

Unnamed: 0,firstName,lastName,age
0,Joe,Jackson,20
1,Tom,Smith,40


## Loading from the Internet

## **!wget command and different methods to unzip**

### !gzip

In [None]:
!wget https://datasets.imdbws.com/name.basics.tsv.gz

--2022-10-15 14:09:39--  https://datasets.imdbws.com/name.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.249.98.61, 13.249.98.91, 13.249.98.35, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.249.98.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 234883890 (224M) [binary/octet-stream]
Saving to: ‘name.basics.tsv.gz’


2022-10-15 14:09:41 (136 MB/s) - ‘name.basics.tsv.gz’ saved [234883890/234883890]



In [None]:
!gzip -d name.basics.tsv.gz

In [None]:
df = pd.read_csv('/content/name.basics.tsv', sep='\t')
df

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0053137,tt0050419,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0075213,tt0117057,tt0038355,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0056404,tt0049189,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0078723,tt0080455,tt0072562"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0060827,tt0083922,tt0050986,tt0069467"
...,...,...,...,...,...,...
12000437,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department","tt11657662,tt14069590,tt2455546"
12000438,nm9993716,Essias Loberg,\N,\N,,\N
12000439,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744
12000440,nm9993718,Aayush Nair,\N,\N,cinematographer,\N


### !unzip

Example with dataset of tweets. More info about dataset: http://help.sentiment140.com/for-students/

In [None]:
!wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

--2022-10-15 14:10:30--  http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip [following]
--2022-10-15 14:10:30--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip’


2022-10-15 14:10:33 (31.3 MB/s) - ‘trainingandtestdata.zip’ saved [81363704/81363704]



In [None]:
!unzip trainingandtestdata.zip

Archive:  trainingandtestdata.zip
  inflating: testdata.manual.2009.06.14.csv  
  inflating: training.1600000.processed.noemoticon.csv  


In [None]:
df = pd.read_csv('testdata.manual.2009.06.14.csv')
df

Unnamed: 0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,@richardebaker no. it is too big. I'm quite ha...
...,...,...,...,...,...,...
492,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
493,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
494,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
495,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


## **git clone**

Example with social media messages dataset. More information about research: https://www.mdpi.com/1999-5903/14/1/4

In [None]:
! git clone https://github.com/afedotowaa/authorship_attribution/

Cloning into 'authorship_attribution'...
remote: Enumerating objects: 278, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 278 (delta 40), reused 101 (delta 37), pack-reused 171[K
Receiving objects: 100% (278/278), 40.26 MiB | 21.44 MiB/s, done.
Resolving deltas: 100% (42/42), done.


In [None]:
df = pd.read_csv(r'/content/authorship_attribution/data/social media/data.csv', names=['author_id', 'message'])
df

Unnamed: 0,author_id,message
0,805156,"Мда, когда-то Тиесто писал норм треки, а не по..."
1,2596709,поскакали??? как теперь при дерьмократии довол...
2,4022849,Кошка принесла котенка значит доверяет хозяйки...
3,4748971,Готов поработать над исследованиями за скромны...
4,7700310,мну больше питика в супермаркеты не берёт 😒 и ...
...,...,...
202887,624887516,Ой сколько я суши и роллов на пол перекидала п...
202888,417017008,"Мне тоже он нравится, спасибо, скоро пойдут, ж..."
202889,624887516,Затирушечка. Самый смак . Даже без мяса с по...
202890,624887516,Все уже прокол.... Не смазывают цыпленка табак...


## **loading data from kaggle**

In [None]:
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

ref                                                           title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
akshaydattatraykhare/diabetes-dataset                         Diabetes Dataset                                      9KB  2022-10-06 08:55:25           2704         78  1.0              
whenamancodes/covid-19-coronavirus-pandemic-dataset           COVID -19 Coronavirus Pandemic Dataset               11KB  2022-09-30 04:05:11           2353         68  1.0              
evangower/premier-league-matches-19922022                     Premier League Matches 1992-2022                     78KB  2022-10-03 02:18:33           1035         42  1.0              
whenamancodes/student-performance                             Student 

In [None]:
! kaggle competitions download -c 'titanic'
! unzip titanic.zip

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 25.8MB/s]
Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
df = pd.read_csv(r'train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
