# Installation and Initial Setup

In [None]:
%%info

Install the necessary packages

In [None]:
from pyspark import SparkConf, SparkContext

# Reuse the SparkContext that is already active in this kernel instead of
# constructing a second one: PySpark raises "Cannot run multiple SparkContexts
# at once" when a context already exists. The app name below is only applied
# if no context exists yet and a new one has to be created.
sc = SparkContext.getOrCreate(SparkConf().setAppName("MyApp"))

# Pinned package versions, installed in the order listed.
# NOTE(review): installing pyspark on top of the cluster's own Spark may
# shadow the bundled version - confirm this pin is intended.
for package_spec in (
    "boto3==1.20.24",       # AWS SDK for Python
    "pyspark==3.2.1",       # PySpark for distributed data processing
    "nltk==3.6.5",          # Natural Language Toolkit for text processing
    "spacy==3.2.0",         # SpaCy for advanced NLP tasks
    "scikit-learn==1.0.2",  # Machine learning library
    "pandas==1.0.5",        # Data analysis and manipulation
    "matplotlib==3.2.1",    # Plotting library for visualizations
    "seaborn==0.11.2",      # Statistical data visualization
    "tqdm==4.62.3",         # Progress bar for loops
    "wordcloud==1.8.1",     # For generating word clouds
):
    sc.install_pypi_package(package_spec)

Import the installed packages from the previous block

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, split, udf
from pyspark.sql.types import StringType, IntegerType, FloatType

import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from wordcloud import WordCloud

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import boto3
from tqdm import tqdm
import re
import string
import json


# Loading Data

Load both CSV files from S3 into Spark DataFrames.

In [None]:
# Read the two CSV exports from the S3 bucket. Both reads use the same
# options: the first row is treated as the header and `\t` is passed as the
# field separator.
# NOTE(review): confirm the uploaded files really are tab-delimited; if they
# are comma-separated, every row will land in a single column.
combined_df = (
    spark.read
    .options(sep=r'\t', header=True)
    .csv('s3://imp/Combined_News_DJIA.csv')
)
stock_prices = (
    spark.read
    .options(sep=r'\t', header=True)
    .csv('s3://imp/upload_DJIA_table.csv')
)

In [None]:
# Print the column names and types Spark assigned to the combined news DataFrame.
combined_df.printSchema()

In [None]:
# Preview the first 5 rows with all columns; truncate=False keeps long
# cell values from being cut off in the output.
combined_df.select("*").show(5, truncate=False)

In [None]:
# Print the column names and types Spark assigned to the stock price DataFrame.
stock_prices.printSchema()

In [None]:
# Preview the first 5 rows with all columns; truncate=False keeps long
# cell values from being cut off in the output.
stock_prices.select("*").show(5, truncate=False)