# Import libraries

In [1]:
# Standard library
import re
import urllib
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so `urllib.request.urlopen` is always available.
import urllib.request

# Third-party
import pandas as pd
from bs4 import BeautifulSoup

# Getting the data

In [2]:
# Download the raw HTML of the Wikipedia page listing car brands by country.
base_site = 'https://en.wikipedia.org/wiki/List_of_car_brands'

# The `with` block closes the HTTP response once the body has been read, and
# the timeout keeps the cell from hanging forever on a stalled connection.
with urllib.request.urlopen(base_site, timeout=30) as response:
    source = response.read()

# Parsing the data

In [3]:
# Build the parse tree from the downloaded page, using the lxml parser.
soup = BeautifulSoup(markup=source, features='lxml')

# Fetching the data

In [4]:
# Gather every <h2> and <li> element, then glue their text together into one
# string, terminating each entry with '\n ' so the entries can be split later.
first_story_paragraph = soup.find_all(["h2", "li"])
text = ''.join(tag.text + '\n ' for tag in first_story_paragraph)

# Preprocessing the data

In [5]:
# Keep only the part of the page that starts at the first country heading and
# lowercase it. The start is found by searching for the heading itself rather
# than by a fixed character offset: an offset silently points at the wrong
# place as soon as anything above the list changes on the page.
FIRST_COUNTRY_HEADING = 'argentina[edit]'

text_lower = text.lower()
list_start = text_lower.find(FIRST_COUNTRY_HEADING)
if list_start == -1:
    # Fail loudly instead of continuing with unrelated page content.
    raise ValueError(
        'Could not find the first country heading %r in the scraped text; '
        'the page layout may have changed.' % FIRST_COUNTRY_HEADING
    )

text0 = text_lower[list_start:]
text0[:100]

'argentina[edit]\n zanella (1948–present)\n anasagasti (1911–1915)\n andino (1967–1973)\n asa (1961– 1969'

In [6]:
# Tag country headings: the '[edit]' suffix that follows each heading is
# swapped for a '-country' marker that later steps use to spot headings.
edit_link_pattern = re.compile(r'\[edit\]')
text1 = edit_link_pattern.sub('-country', text0)
text1[0:100]

'argentina-country\n zanella (1948–present)\n anasagasti (1911–1915)\n andino (1967–1973)\n asa (1961– 19'

In [7]:
# Replace every "(...)" or "[...]" span, delimiters included, with a space.
# The match is non-greedy and '.' does not cross newlines, so a span is only
# removed when it opens and closes on the same line.
bracketed_pattern = re.compile(r'[\(\[].*?[\)\]]')
text2 = bracketed_pattern.sub(' ', text1)
text2[0:100]

'argentina-country\n zanella  \n anasagasti  \n andino  \n asa  \n eniak  \n hispano-argentina  \n industria'

In [8]:
# Blank out any leftover (parentheses), [square brackets] and {curly brackets}
# that the previous step did not remove.
stray_bracket_pattern = re.compile(r'[()[\]{}]')
text3 = stray_bracket_pattern.sub(' ', text2)

# Characters present in only one of the two versions, i.e. what was removed.
set(text2) ^ set(text3)

{'(', ')', '{', '}'}

In [9]:
# Replace every digit with a space.
digit_pattern = re.compile(r'\d')
text4 = digit_pattern.sub(' ', text3)

# Characters present in only one of the two versions, i.e. what was removed.
set(text3) ^ set(text4)

{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}

In [10]:
# Break the cleaned text into one list entry per line (split on '\n'),
# then preview the first five entries.
text5 = text4.split(sep='\n')
text5[0:5]

['argentina-country', ' zanella  ', ' anasagasti  ', ' andino  ', ' asa  ']

In [11]:
# Work on a shallow copy so the split lines in text5 stay untouched.
data = list(text5)

In [12]:
# Group the flat list of lines into one sub-list per country:
#     [[country, brand, brand, ...], [country, brand, ...], ...]
# Approach adapted from
# https://stackoverflow.com/questions/52551398/slicing-a-list-into-sublists-based-on-condition
#
# A line is a country heading only when it ENDS with the '-country' marker
# added earlier. Testing for the bare substring 'country' would also match any
# brand whose name happens to contain that word and wrongly start a new group.
COUNTRY_MARKER = '-country'

arrays = []
for position, raw_line in enumerate(data):
    entry = raw_line.strip()  # removing whitespace
    is_heading = entry.endswith(COUNTRY_MARKER)
    if is_heading:
        # Drop the marker so only the country name remains.
        entry = entry[:-len(COUNTRY_MARKER)].strip()

    if is_heading or position == 0:
        # The very first line always opens the first group.
        arrays.append([entry])
    else:
        arrays[-1].append(entry)

# `data` is deliberately left unmodified so this cell can be re-run safely:
# stripping the markers from `data` in place would make a second run lump
# every line into a single group.

# Exporting the data

In [13]:
# Lay the groups out as columns: one row per group (minus the last two
# groups), then transpose so that each group becomes a column.
country_table = pd.DataFrame(arrays[:-2]).transpose()

# The first row of the transposed frame holds the country names; use them,
# stripped of surrounding whitespace, as the column labels.
header = [country.strip() for country in country_table.iloc[0]]

# Everything below the first row is brand data.
df = country_table.iloc[1:]
df.columns = header

# Preview the result
df.head()

Unnamed: 0,argentina,australia,austria,azerbaijan,belgium,bosnia and herzegovina,brazil,bulgaria,canada,china,...,thailand,tunisia,turkey,uganda,ukraine,united arab emirates,united kingdom,united states,uruguay,vietnam
1,zanella,alpha sports,eurostar automobilwerk,ga,ecar,pretis,abais,litex motors,electrameccanica,baic group,...,akepanich,barkia,anadol,kiira,zaz,devel motors,ac cars,am general,nordex,chienthang
2,anasagasti,arrow,ktm,khazar,edran,tas,adamo gt,sin cars,htt,baolong,...,c-fee,industries mécaniques maghrébines,devrim,,,shayton,arash,anteros,dellepiane,la dalat
3,andino,birchfield,magna steyr,naz,gillet,,agrale,bulgaralpine,intermeccanica,beijing automotive industry holding corporation,...,cherdchai,wallyscar,diardi,,,w motors,ariel,arcimoto,el terruno,thaco
4,asa,bolwell,öaf,aziz,imperia automobiles,,americar,bulgarrenault,wingho,beijing automobile works,...,deva,,etox,,,zarooq motors,aston martin,aurica,grumett,vinfast
5,eniak,borland racing developments,puch,,adk,,amoritz gt,moskvitch,acadian,beijing automobile works,...,kwaithong,,evt s,,,,bac,bollinger motors,guitolar,vinaxuki


In [14]:
# Write the table to CSV without the row index.
# NOTE(review): 'paaer' in the file name looks like a typo (perhaps 'per') —
# confirm nothing else reads this path before renaming it.
output_path = 'scraped_brands_paaer_country.csv'
df.to_csv(output_path, index=False)