In [None]:
# Author: Omar Bheda
# Date: 10/22/2020
# Version: 1.3
# Description: A Jupyter Notebook to automate the execution and parsing of key OSINT processes into an excel deliverable for further penetration testing activities.

# START TIME

In [None]:
from datetime import datetime

# Capture the moment the notebook run begins.
now = datetime.now()

# Render it as HH:MM:SS and echo it so the run log shows when work started.
start_time = "{:%H:%M:%S}".format(now)
print("Start Time =", start_time)

# Introduction
Open Source Intelligence Gathering (OSINT) is the first step in a targeted attack such as in a penetration test or red team activity. While there are ways and means to do this covertly, intelligence gathering typically starts with scraping information from public sources, collectively known as open source intelligence or OSINT. 

Detailed in this Jupyter Notebook is the methodology utilized by the Dallas Lab to conduct OSINT for clients. This Jupyter Notebook will also serve to standardize OSINT techniques across lab resources and train resources on a common methodology.

# Setup
Before running any other cell, assign values to the variables below and run the cell. This changes the targets for every tool without needing to modify the script parameters. DOMAIN is the domain to enumerate; an example of an acceptable value is "protiviti.com". ORG_NAME is the name of the target org, and FOLDER_NAME is the name of the folder that is created to hold all output files.

In [None]:
#Set DOMAIN, ORG_NAME & FOLDER_NAME Variables
import os

# Domain handed to subfinder, assetfinder, infoga and GitDorker.
DOMAIN = "example.com"
# Keyword handed to cloud_enum via "-k".
ORG_NAME = "example"
# Output folder, relative to the notebook's working directory; every tool
# cell writes its results here.
FOLDER_NAME = "example"

# exist_ok=True keeps this cell re-runnable: a plain `mkdir` reports an error
# on every run after the first, and the shell form also word-splits names
# that contain spaces.
os.makedirs(FOLDER_NAME, exist_ok=True)

In [None]:
# Echo every target variable from the setup cell so the operator can confirm
# the scope before any tool is launched. ORG_NAME is included because the
# cloud_enum cell uses it as its keyword.
print("The Following variables have been set:")
print("")
print("DOMAIN NAME: " + DOMAIN)
print("ORG NAME: " + ORG_NAME)
print("FOLDER NAME: " + FOLDER_NAME)
print("")

## Tool Initialization Script

The Tool Initialization Script performs a "locate" on a script to be executed during the OSINT process. The path of the script from the "locate" function is added to a dictionary and referenced throughout the PROSINT notebook.

In [None]:
#Tool Init Script

#Tool registry: maps a tool name to { 'path': ..., 'version': ... }
configDict = {}

#System Call Function
def systemCall( cmd ):
    """Run `cmd` in a shell and return the first line it writes to stdout.

    Only stdout is inspected, so a diagnostic printed on stderr (for example
    by `locate` when its database is missing) is never mistaken for a result.

    Args:
        cmd: Shell command line to execute. It is passed to the shell as-is,
            so only call this with trusted, fixed command strings.

    Returns:
        The first stdout line as a string, without its trailing newline.

    Raises:
        IndexError: If the command writes nothing to stdout. The message names
            the command and includes anything it printed on stderr.
    """
    import subprocess

    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    lines = completed.stdout.splitlines()
    if not lines:
        # Fail here with context instead of a bare "list index out of range".
        raise IndexError(
            "'{}' produced no output (stderr: {})".format( cmd, completed.stderr.strip() or "empty" )
        )
    return lines[0]

#Update System Database
!updatedb

#DNSProbe & Assetfinder copied to local path via setup script. Are not showed in tool initialization script.

#subfinder
subPath = systemCall( "locate subfinder/subfinder" )
print( subPath )
subVersion = "2.3.2"
print( subVersion )

configDict['subfinder'] = { 'path': subPath, 'version': subVersion }
print("")

#Shodan Scraper
shodanPath = systemCall( "locate shodanScraper.py" )
print( shodanPath )
shodanVersion = "N/A"
print( shodanVersion )

configDict['shodan'] = { 'path': shodanPath, 'version': shodanVersion }
print("")

#GitDorker
GitDorkerPath = systemCall( "locate GitDorker.py" )
print( GitDorkerPath )
GitDorkerVersion = "n/a"
print( GitDorkerVersion )

configDict['GitDorker'] = { 'path': GitDorkerPath, 'version': GitDorkerVersion }
print("")

#Infoga
infogaPath = systemCall( "locate infoga.py" )
print( infogaPath )
infogaVersion = "N/A"
print( infogaVersion )

configDict['infoga'] = { 'path': infogaPath, 'version': infogaVersion }
print("")

#CloudEnum
cloudenumPath = systemCall( "locate cloud_enum.py" )
print( cloudenumPath )
cloudenumVersion = "n/a"
print( cloudenumVersion )

configDict['cloudenum'] = { 'path': cloudenumPath, 'version': cloudenumVersion }
print("")

print("TOOL DICTIONARY")
print( repr( configDict ) )

# Subdomain Enumeration
Subdomain enumeration is the process of finding subdomains for one or more domain(s). Subdomain enumeration can reveal domains/sub-domains that are in scope of a security assessment but may not have been mentioned within the Rules of Engagement (ROE). This can come in handy when given a small scope, as one can confirm with the client whether or not the newly discovered domains/sub-domains are in scope, thereby increasing the chances of discovering vulnerabilities.

## subfinder
subfinder is a subdomain discovery tool that discovers valid subdomains for websites by using passive online sources.
It has a simple modular architecture and is optimized for speed. subfinder is built for doing one thing only - passive 
subdomain enumeration, and it does that very well.

In [None]:
!{configDict['subfinder']['path']} -d $DOMAIN | tee $FOLDER_NAME/domains.csv
print("COMPLETE")

## Assetfinder

Assetfinder is a tool written by Tomnomnom designed to enumerate subdomains of websites using various sources. 

In [None]:
print("Scraping domains from assetfinder...")
print("")
!echo $DOMAIN | assetfinder --subs-only | tee -a $FOLDER_NAME/domains.csv
print("")
print("Scraping complete!")

## Formatting & Deduping Subdomains
The bash commands below remove the "www." prefix from domains and de-duplicate the list so that the domains can be passed into the Bulk IP Lookup script.

In [None]:
!sed -i 's/www.//g' $FOLDER_NAME/domains.csv
!sed -i 's/<BR>/\n/g' $FOLDER_NAME/domains.csv
!sort -u $FOLDER_NAME/domains.csv > $FOLDER_NAME/formatting.txt
!cat $FOLDER_NAME/formatting.txt > $FOLDER_NAME/domains.csv
!rm $FOLDER_NAME/formatting.txt

print("Complete")

## Bulk Reverse DNS
This tool by projectdiscovery (dnsprobe) performs a bulk reverse DNS lookup on the file containing subdomains ("domains.csv", produced by the subdomain enumeration steps above).

In [None]:
print("Starting DNS probe for valid domains...")
print("")
!dnsprobe -l $FOLDER_NAME/domains.csv | tr ' ' ',' | tee $FOLDER_NAME/reversedns.csv 
!cat $FOLDER_NAME/reversedns.csv  | cut -d',' -f2 | sort -u > $FOLDER_NAME/IPs.csv 
print("")
print("DNS Probe finsished!")

<!-- ## DNSTwist
DNSTwist allows one to find similar-looking domains that adversaries can use to attack the target domain. It can detect typosquatters, phishing attacks, fraud and corporate espionage and is useful as an additional source of targeted threat intelligence. -->

# Email Gathering
Email Gathering is the process of finding emails to be leveraged during Active Testing. The most common use is conducting password guessing attempts against client systems. The objective of this process should be to obtain as many client emails as possible to improve the chances of successful password spraying.


## Infoga
Infoga is a tool for gathering email account information (IP, hostname, country, ...) from different public sources (search engines, PGP key servers and Shodan); it also checks whether emails were leaked using the haveibeenpwned.com API.

In [None]:
!python {configDict['infoga']['path']} -d $DOMAIN -s all -r infoga.txt
!cat infoga.txt | grep + | cut -d' ' -f3 > $FOLDER_NAME/emails.csv 
!sed 's/$/,infoga/' $FOLDER_NAME/emails.csv  > $FOLDER_NAME/formattedemails 
!(echo "EMAIL,SOURCE" && cat $FOLDER_NAME/formattedemails ) > $FOLDER_NAME/emails.csv 
!rm $FOLDER_NAME/formattedemails 
!rm infoga.txt
print("COMPLETE")

## Shodan

In [None]:
!python3 {configDict['shodan']['path']} $FOLDER_NAME/IPs.csv shodan
!mv --target-directory=$FOLDER_NAME shodan.csv

# Vendor Service Enumeration 

## Cloudenum

Multi-cloud OSINT tool. Enumerate public resources in AWS, Azure, and Google Cloud. Add in multiple "-k" arguments for multiple keywords or use the "-kf" argument to specify a file with a list of keywords separated by line.

Example: -k somecompany -k somecompany.io -k somecompany.com

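The current setup passes the ORG_NAME variable as a single keyword via "-k" and disables the Azure and Google Cloud checks, so only AWS is enumerated. To feed in a keyfile instead (for example, the formatted subdomains), specify "-kf" rather than individual keywords.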

In [None]:
print("Cloudenum in progress")
!python3 {configDict['cloudenum']['path']} -k $ORG_NAME -t 50 --disable-gcp --disable-azure -l cloudenum.csv
!mv --target-directory=$FOLDER_NAME/ cloudenum.csv
print("")
print("Cloudenum Complete")

## GitDorker
A tool to perform a variety of GitHub dorks and identify sensitive information exposure. A GitHub token must be provided.

Full documentation available here: __[GitDorker Documentation](http://tinyurl.com/GitDorker)__

In [None]:
GH_FILE_NAME = FOLDER_NAME + "_gh_dorks.csv"
!python3 {configDict['GitDorker']['path']} -tf tokensfile.txt -q $DOMAIN -d dorks/alldorks.txt -o $FOLDER_NAME
!mv --target-directory=$FOLDER_NAME/ $GH_FILE_NAME

# OSINT Parsing Script & Excel Deliverable Serialization

In [None]:
#REQ: import requirements: pandas, openpyxl
import pandas as pd
import os
import csv
import glob
import xlsxwriter
import openpyxl


#path to parse to and read files from
# Resolved from the working directory (with a trailing separator) because
# every tool cell wrote into the relative FOLDER_NAME; a fixed /root/.jupyter
# prefix only matched when the kernel happened to start there.
path = os.path.join(os.path.abspath(FOLDER_NAME), "")

#all files ending in .csv (sorted so the worksheet order is stable between runs)
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

#workbook that will hold one worksheet per csv
workbook_path = os.path.join(path, FOLDER_NAME + '_OSINT.xlsx')

#write all files into excel from dataframes and name worksheet by filename
print("The following files are being parsed to " + path + ":")
print("")

if not all_files:
    # Saving a workbook with no sheets fails, so report and stop instead.
    print("No .csv files found in " + path + " - nothing to parse")
else:
    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(workbook_path, engine='openpyxl') as writer:
        for f in all_files:
            print(f)
            try:
                df = pd.read_csv(f)
            except pd.errors.EmptyDataError:
                # A tool that found nothing leaves an empty file; keep an
                # empty sheet for it rather than aborting the whole export.
                df = pd.DataFrame()
            # Excel limits worksheet names to 31 characters.
            df.to_excel(writer, sheet_name=os.path.basename(f)[:31])

    print("")
    print("Parsing of " + FOLDER_NAME + "_OSINT.xlsx Complete")

# END TIME

In [None]:
# Capture the moment the notebook run finishes.
now = datetime.now()

# Same HH:MM:SS rendering as the start-time cell, for easy comparison.
end_time = "{:%H:%M:%S}".format(now)
print("End Time =", end_time)