# Import libraries

In [1]:
from bs4 import BeautifulSoup
import urllib
import re

import pandas as pd

# Getting the data

In [2]:
base_site = 'https://en.wikipedia.org/wiki/List_of_current_automobile_manufacturers_by_country'
source = urllib.request.urlopen(base_site).read()

# Parsing the data

In [3]:
soup = BeautifulSoup(source, 'lxml')

# Fetching the data

In [4]:
text = ''
first_story_paragraph = soup.find_all(["h2", "li"])
for i in first_story_paragraph:
    text += (i.text + '\n ')

# Preprocessing the data

In [5]:
text0 = text.lower()

In [6]:
# Remove (parantheses), [square brackets] and {curly bracketes}
text1 = re.sub(r'[()[\]{}]', ' ', text0)
set(text0).symmetric_difference(set(text1))

{'(', ')', '[', ']'}

In [7]:
# remove digits
text2 = re.sub(r'\d', ' ', text1)
set(text1).symmetric_difference(set(text2))

{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}

In [8]:
# split text by newline (\n) & slice to get desired data
text3 = text2.split('\n ')
data = text3[9:310]
data[:5]

['africa edit ',
 'snvi          ',
 'egy-tech engineering',
 'kantanka cars      ',
 'mobius motors          ']

In [9]:
# Removing " -present"
for i in range(len(data)):
    data[i] = re.sub(r'\s+\W+\w+', '', data[i])

In [10]:
# Removing \n...
for i in range(len(data)):
    data[i] = re.sub(r'\n\w+', '',data[i])

In [11]:
# https://stackoverflow.com/questions/52551398/slicing-a-list-into-sublists-based-on-condition
data[0] = re.sub(r' edit','',data[0])

arrays = [[data[0]]]

for i in range(1, len(data)):
    if 'edit' not in data[i]:
        arrays[len(arrays)-1].append(data[i].strip()) #removing whitespace
    else:
        data[i] = re.sub(r' edit','',data[i])
        arrays.append([data[i]])

# Exporting the data

In [12]:
df = pd.DataFrame(arrays).T
header = df.iloc[0]
df = df[1:]
df.columns = header
df

Unnamed: 0,africa,asia,europe,oceania,north america,south america
1,snvi,aftab automobiles,magna steyr,alpha sports,general motors,chamonix
2,egy-tech engineering,bmtf,ganja auto plant,bolwell,buick,fabral
3,kantanka cars,jamuna automobiles,khazar,borland racing developments,cadillac,tac
4,mobius motors,niloy-hero motors venture between hero motocor...,naz,devaux cars,chevrolet,troller
5,laraki,php automobiles,belgee,elfin sports cars,gmc,
...,...,...,...,...,...,...
134,,,mclaren,,,
135,,,mini,,,
136,,,morgan,,,
137,,,rolls-royce,,,


In [13]:
df.to_csv('scraped_car_origin.csv', index = False)