In [16]:
import os
import zipfile
import pandas as pd
import json

In [17]:
def read_zipped_csvs(folder_path):
    # List all files in the folder
    all_files = os.listdir(folder_path)

    # Filter out only zip files
    zip_files = [file for file in all_files if file.endswith('.zip')]

    # Initialize a list to store individual DataFrames
    dataframes = []

    # Loop through each zip file
    for zip_file in zip_files:
        print(zip_file)
        # Open the zip file
        with zipfile.ZipFile(os.path.join(folder_path, zip_file), 'r') as z:
            # Loop through each file inside the zip file
            for file in z.namelist():
                # Check if the file is a CSV
                if file.endswith('.csv'):
                    # Read the CSV file into a DataFrame
                    df = pd.read_csv(z.open(file))
                    print(df.shape[0])

                    # Add the DataFrame to the list of DataFrames
                    dataframes.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)

    return combined_df

In [18]:
folder_path = 'D:/kickstarter_data'
combined_df = read_zipped_csvs(folder_path)
print(combined_df.head())

Kickstarter_2022-01-20T03_20_11_451Z.zip
3672
3662
3662
3664
3656
3663
3660
3647
827
Kickstarter_2022-02-10T03_20_20_292Z.zip
3654
3649
3656
3662
3692
3669
3652
3554
Kickstarter_2022-03-24T03_20_19_285Z.zip
3640
3663
3669
3672
3666
3677
3683
3663
3653
1283
Kickstarter_2022-04-21T03_20_08_060Z.zip
3660
3663
3663
3662
3696
3681
3657
3664
1646
Kickstarter_2022-05-19T03_20_05_346Z.zip
3654
3665
3661
3657
3655
3660
3657
3667
3652
3660
3643
3664
3656
3668
3659
3659
3647
3666
3665
3663
3663
3661
3647
3662
3658
3665
3662
3657
3655
3654
3661
3658
3668
3658
3661
3641
3668
3662
3665
3656
3660
3649
3656
3660
3663
3669
3659
3656
3658
3656
3656
3667
3658
3655
3648
3666
3657
3666
3655
3665
3651
3662
3155
Kickstarter_2022-06-09T03_20_03_365Z.zip
3653
3667
3669
3662
3660
3662
3649
3663
3667
3662
3662
3662
3643
3656
3671
3661
3661
3672
3643
3660
3672
3666
3666
3663
3648
3665
3660
3670
3657
3661
3658
3650
3664
3675
3660
3665
3667
3648
3658
3664
3667
3658
3662
3660
3651
3662
3671
3659
3658
3660
3646
3664


In [19]:
combined_df.shape[0]

616363

In [20]:
#convert urls column from string to dict
combined_df['urls'] = combined_df['urls'].apply(lambda x: json.loads(x))
#extract project url from the urls column
combined_df['link_to_project'] = combined_df['urls'].apply(lambda x: x.get('web', {}).get('project'))


In [21]:
filtered_df = combined_df[combined_df['state'].isin(["successful", "failed"])]


In [22]:
filtered_df.shape[0]

574665

In [81]:
unique_count = filtered_df['id'].nunique()

In [82]:
unique_count

193343

In [86]:
filtered_df_1 = filtered_df[filtered_df['id'] == 1535549790]


In [87]:
filtered_df_1

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_exchange_rate,usd_pledged,usd_type,link_to_project
3672,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,domestic,https://www.kickstarter.com/projects/designmus...
49697,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,domestic,https://www.kickstarter.com/projects/designmus...
74624,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...
113830,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...
271463,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...
313927,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...
419263,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,domestic,https://www.kickstarter.com/projects/designmus...
483345,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...


In [27]:
unique_df = filtered_df.drop_duplicates(subset='blurb', keep='first')


In [28]:
unique_df.shape[0]

191867

In [29]:
unique_df['link_to_project'][1]

'https://www.kickstarter.com/projects/559623833/chirault-volume-2?ref=discovery_category_newest'

In [30]:
unique_df['cleaned_link_to_project'] = unique_df['link_to_project'].apply(lambda x: x.split('?')[0])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['cleaned_link_to_project'] = unique_df['link_to_project'].apply(lambda x: x.split('?')[0])


In [33]:
unique_df['cleaned_link_to_project'][0]

'https://www.kickstarter.com/projects/1881653671/medcomic-the-most-entertaining-way-to-study-medici'

In [48]:
import requests
from bs4 import BeautifulSoup

def scrape_data(urls, css_selector):
    data = []

    for url in urls:
        # Send an HTTP request to get the content of the webpage
        response = requests.get(url)

        # If the request is successful, parse the content
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            elements = soup.select(css_selector)
            extracted_data = [element.get_text() for element in elements]
            data.append(extracted_data)
        else:
            data.append([])  # Add an empty list for failed requests

    return data

In [52]:
#urls = unique_df['cleaned_link_to_project'].tolist()
urls = unique_df['cleaned_link_to_project'][0:3].tolist()
print(urls)
css_selector = '#content-wrap > section > div.project-profile__content > div.grid-container.pb3.pb10-sm > div > div.grid-col-12.grid-col-8-lg > div > div > div > img'  # Replace this with the appropriate CSS selector for the data you want to scrape
scraped_data = scrape_data(urls, css_selector)

# Add the scraped data as a new column in the DataFrame
print(scraped_data)

['https://www.kickstarter.com/projects/1881653671/medcomic-the-most-entertaining-way-to-study-medici', 'https://www.kickstarter.com/projects/559623833/chirault-volume-2', 'https://www.kickstarter.com/projects/weregeekcomics/weregeek-the-final-showdown']
[[], [], []]


In [105]:
with open('C:/Users/amito/Downloads/proxylist.txt', 'r') as f:
    proxy_list = f.read().splitlines()

In [106]:
proxy_list

['80.48.119.28:8080',
 '92.27.165.234:80',
 '8.219.176.202:8080',
 '82.148.6.193:80',
 '137.184.245.154:80',
 '103.219.193.174:80',
 '139.99.237.62:80',
 '190.61.88.147:8080',
 '45.56.119.212:8015',
 '103.142.141.71:80',
 '20.111.54.16:80',
 '20.210.113.32:80',
 '20.206.106.192:80',
 '20.24.43.214:80',
 '217.76.50.200:8000',
 '129.150.36.169:8090',
 '20.205.61.143:80',
 '169.55.89.6:80',
 '103.151.41.7:80',
 '103.216.103.163:80',
 '168.11.52.41:8080',
 '128.199.202.122:3128',
 '58.27.59.249:80',
 '45.62.167.249:80',
 '103.152.112.234:80',
 '141.11.184.230:80',
 '152.69.215.206:80',
 '139.99.135.214:80',
 '165.154.236.214:80',
 '167.71.205.47:8080',
 '138.117.219.108:80',
 '95.183.140.94:80',
 '95.183.140.89:80',
 '38.7.207.92:80',
 '139.59.1.14:3128',
 '134.19.254.2:21231',
 '47.243.50.83:8080',
 '34.146.64.228:3128',
 '217.79.253.106:80',
 '196.223.129.21:80',
 '154.118.228.212:80',
 '195.154.106.167:80',
 '103.141.142.153:41401',
 '8.218.239.151:3128',
 '34.23.45.223:80',
 '182.16.12

In [42]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from fake_useragent import UserAgent

# Create a UserAgent object
user_agent = UserAgent()

options = webdriver.ChromeOptions()
#options.add_argument('--headless')

#options.add_argument('--disable-gpu')


options.add_argument(f'user-agent={user_agent.random}')


options.add_argument('--proxy-server=http://unblock.oxylabs.io:60000')
options.add_argument('--proxy-auth=amitoj1996:Amitoj#1996')

url = 'https://www.kickstarter.com/projects/1881653671/medcomic-the-most-entertaining-way-to-study-medici'
chromedriver_path = 'C:/Users/amito/Downloads/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chromedriver_path,options=options)
try:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    page_contents = soup.prettify()
    #driver.quit()
except Exception as e:
    print(f'Error: {e}')
    print(f'Switching to next proxy: {proxy_list[proxy_list.index(proxy) + 1]}')


    
target_paragraph = soup.find('p', text="Finally, studying is fun. Medcomic's combination of art, humor, and medicine makes it easy to recall information and ace exams. Purchase the book today!")


  driver = webdriver.Chrome(executable_path=chromedriver_path,options=options)


In [40]:
print(target_paragraph)

None


In [14]:
import requests
import queue
import threading

q = queue.Queue()
valid_proxies=[]

with open('C:/Users/amito/Downloads/proxylist.txt', 'r') as f:
    proxy_list = f.read().split("\n")
    for p in proxy_list:
        q.put(p)
        

def check_proxies():
    global q
    while not q.empty():
        proxy = q.get()
        try:
            res = requests.get("http://ipinfo.io/json",proxies = {"https": proxy})
        except:
            continue
        if res.status_code == 200:
            print(proxy)

In [15]:
for _ in range(10):
    threading.Thread(target=check_proxies).start()

80.48.119.28:8080
8.219.176.202:8080
92.27.165.234:80
137.184.245.154:80
82.148.6.193:80
190.61.88.147:8080
103.219.193.174:80
139.99.237.62:80
45.56.119.212:8015
103.142.141.71:80
20.111.54.16:80
20.206.106.192:80
217.76.50.200:8000
20.210.113.32:80
20.24.43.214:80
129.150.36.169:8090
20.205.61.143:80
169.55.89.6:80
103.216.103.163:80
103.151.41.7:80
168.11.52.41:8080
128.199.202.122:3128
58.27.59.249:80
103.152.112.234:80
45.62.167.249:80
141.11.184.230:80
152.69.215.206:80
139.99.135.214:80
165.154.236.214:80
167.71.205.47:8080
95.183.140.94:80
138.117.219.108:80
95.183.140.89:80
47.243.50.83:8080
134.19.254.2:21231
38.7.207.92:80
139.59.1.14:3128
34.146.64.228:3128
196.223.129.21:80
217.79.253.106:80
154.118.228.212:80
195.154.106.167:80
34.23.45.223:80
103.141.142.153:41401
8.218.239.151:3128
182.16.12.26:8088
202.146.4.101:3128
192.248.125.3:80
103.147.247.15:3127
65.18.114.254:55443
173.201.183.110:80
102.134.98.222:8081
187.17.163.223:92
82.167.252.100:8080
175.106.10.98:3128
1

In [35]:
with open('C:/Users/amito/Downloads/validproxieslist.txt', 'r') as f:
    proxies = f.read().split("\n")
    
sites_to_check = ['https://www.kickstarter.com/projects/1881653671/medcomic-the-most-entertaining-way-to-study-medici', 'https://www.kickstarter.com/projects/559623833/chirault-volume-2', 'https://www.kickstarter.com/projects/weregeekcomics/weregeek-the-final-showdown']

counter = 0

for site in sites_to_check:
    try:
        print(f"using the proxy: {proxies[counter]}")
        res = requests.get(site, proxies={"https": proxies[counter]})
        print(res.status_code)
    except:
        print("failed")
    finally:
        counter += 1

using the proxy: 80.48.119.28:8080
failed
using the proxy: 8.219.176.202:8080
403
using the proxy: 92.27.165.234:80
failed


In [43]:
import pprint

In [51]:
from pprint import pprint
import requests

# Structure payload.
payload = {
    'source': 'universal',
    'url': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
    'user_agent_type': 'desktop',
    'geo_location': 'United States'
#     'context': [
#         {
#             'key': 'session_id',
#             'value': '1234567890abcdef'
#         },
#         {
#             'key': 'headers',
#             'value': {
#                 'Content-Type': 'application/octet-stream',
#                 'Custom-Header': 'custom header content'
#             },
#         },
#         {
#             'key': 'cookies',
#             'value': [
#                 {
#                     'key': 'NID',
#                     'value': '1234567890'
#                 },
#                 {
#                     'key': '1P_JAR',
#                     'value': '0987654321'
#                 },
#             ],
#         },
#         {
#             'key': 'follow_redirects',
#             'value': True
#         },
#         {
#             'key': 'successful_status_codes',
#             'value': [200, 201, 202]
#         },
#         {
#             'key': 'http_method',
#             'value': 'get'
#         },
#         {
#             'key': 'content',
#             'value': 'ZGF0YT0xMjM='  # base64 encoded post body
#         },
#     ],
}

# Get response.
response = requests.request(
    'POST',
    'https://realtime.oxylabs.io/v1/queries',
    auth=('amitoj1996', 'Amitoj#1996'),
    json=payload,
)

# Instead of response with job status and results url, this will return the
# JSON response with the result.
pprint(response.json())

{'message': 'Unauthorized'}


In [62]:
import requests
import json


In [63]:
OXYLABS_USERNAME = 'amitoj1996'
OXYLABS_PASSWORD = 'Amitoj#1996'
OXYLABS_ENDPOINT = 'https://realtime.oxylabs.io/v1/queries'
TARGET_URL = 'https://www.kickstarter.com/projects/1881653671/medcomic-the-most-entertaining-way-to-study-medici'



In [64]:
def fetch_oxylabs(url):
    headers = {
        'Content-Type': 'application/json'
    }

    payload = {
        'source': 'universal',
        'url': url
    }

    try:
        response = requests.post(
            OXYLABS_ENDPOINT,
            auth=(OXYLABS_USERNAME, OXYLABS_PASSWORD),
            json=payload,
            headers=headers,
            timeout=60
        )

        if response.status_code == 200:
            data = response.json()
            return data['data']['content'] if 'data' in data and 'content' in data['data'] else None
        else:
            print(f"Error: Status code {response.status_code}")
            return None
    except Exception as e:
        print(f"Error: {e}")
        return None


In [67]:
html_content = fetch_oxylabs(TARGET_URL)

# Process the content using BeautifulSoup, lxml, or other HTML parsing libraries as needed.
# For example, with BeautifulSoup:
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html_content, 'html.parser')
# ...


Error: Status code 401


In [95]:
from pprint import pprint
import requests
import os

# Structure payload.
output_dir = "D:/kickstarter_json/"

def fetch_data(url):

    payload = {
    'source': 'universal',
    'url': url,
    'user_agent_type': 'desktop',
    'geo_location': 'United States',
    
    }
    # Get response.
    response = requests.request(
    'POST',
    'https://realtime.oxylabs.io/v1/queries',
    auth=('amitoj1996', 'Amitoj1996'),
    json=payload,
    )
    return response.json()
# Instead of response with job status and results url, this will return the
# JSON response with the result.


In [96]:
def get_id_1(url):
    id_1  = unique_df.loc[unique_df['cleaned_link_to_project'] == url,'id'].values[0]
    return id_1

In [97]:
def process_url(url):
    #url = row['cleaned_link_to_project']
    
    id_1 = get_id_1(url)
    
    
    file_name = f'response_{id_1}.json'
    output_path = os.path.join(output_dir, file_name)
    
    if os.path.exists(output_path):
        print(f"Skipping {id_1}, file already exists.")
        return
    
    # Fetch data
    data = fetch_data(url)
    # Save the JSON response to a file for each URL
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile)

In [99]:
import os
import concurrent.futures

start = 1000
end = 4000


urls = unique_df['cleaned_link_to_project'].iloc[start:end].tolist()

max_workers = 16  # Adjust this value based on your machine and network capacity
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    executor.map(process_url, urls)

In [92]:
for index, row in unique_df.head(1000).iterrows():
    url = row['cleaned_link_to_project']
    id_1 = row['id']
    
    
    file_name = f'response_{id_1}.json'
    output_path = os.path.join(output_dir, file_name)
    
    if os.path.exists(output_path):
        print(f"Skipping {id_1}, file already exists.")
        continue
    
    # Fetch data
    data = fetch_data(url)
    # Save the JSON response to a file for each URL
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile)

Skipping 1767289981, file already exists.
Skipping 1210231263, file already exists.


KeyboardInterrupt: 